1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_itrans_recon_atom_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse quantization, inverse 25 * transform and reconstruction 26 * 27 * @author 28 * 100470 29 * 100592 (edited by) 30 * 31 * @par List of Functions: 32 * - ihevc_itrans_recon_4x4_ttype1_ssse3() 33 * - ihevc_itrans_recon_4x4_ssse3() 34 * - ihevc_itrans_recon_8x8_ssse3() 35 * 36 * @remarks 37 * None 38 * 39 ******************************************************************************* 40 */ 41 #include <stdio.h> 42 #include <string.h> 43 #include "ihevc_typedefs.h" 44 #include "ihevc_platform_macros.h" 45 #include "ihevc_macros.h" 46 #include "ihevc_defs.h" 47 #include "ihevc_func_selector.h" 48 #include "ihevc_trans_tables.h" 49 #include "ihevc_iquant_itrans_recon.h" 50 #include "ihevc_trans_macros.h" 51 52 53 #include <immintrin.h> 54 #include <emmintrin.h> 55 56 #include <tmmintrin.h> 57 58 59 /** 60 ******************************************************************************* 61 * 62 * @brief 63 * This function performs inverse quantization, inverse transform 64 * type1(DST) and reconstruction for 4x4 input block 65 * 66 * @par Description: 67 * Performs inverse quantization , inverse transform type 1 and adds 68 * prediction data and clips output to 8 bit 69 * 70 * @param[in] pi2_src 71 * Input 4x4 coefficients 72 * 73 * @param[in] pi2_tmp 74 * Temporary 4x4 buffer for storing inverse 75 * transform 1st stage output 76 * 77 * @param[in] pu1_pred 78 * Prediction 4x4 block 79 * 80 * @param[in] pi2_dequant_coeff 81 * Dequant Coeffs 82 * 83 * @param[out] pu1_dst 84 * Output 4x4 block 85 * 86 * @param[in] qp_div 87 * Quantization parameter / 6 88 * 89 * @param[in] qp_rem 90 * Quantization parameter % 6 91 * 92 * @param[in] src_strd 93 * Input stride 94 * 95 * @param[in] pred_strd 96 * Prediction stride 97 * 98 * @param[in] dst_strd 99 * Output Stride 100 * 101 * @param[in] zero_cols 102 * Zero columns in pi2_src 103 * 104 * @returns Void 105 * 106 * @remarks 107 * None 108 * 109 ******************************************************************************* 110 */ 111 112 void ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src, 113 WORD16 *pi2_tmp, 114 UWORD8 *pu1_pred, 115 UWORD8 *pu1_dst, 116 WORD32 src_strd, 117 WORD32 pred_strd, 118 WORD32 dst_strd, 119 WORD32 zero_cols, 120 WORD32 zero_rows) 121 { 122 __m128i m_temp_reg_0; 123 __m128i m_temp_reg_1; 124 __m128i m_temp_reg_2; 125 __m128i m_temp_reg_3; 126 __m128i m_temp_reg_4; 127 __m128i m_temp_reg_10; 128 __m128i m_temp_reg_11; 129 __m128i m_temp_reg_12; 130 __m128i m_temp_reg_13; 131 __m128i m_temp_reg_14; 132 __m128i m_temp_reg_20; 133 __m128i m_temp_reg_21; 134 __m128i m_temp_reg_22; 135 __m128i m_temp_reg_23; 136 __m128i m_temp_reg_24; 137 __m128i m_temp_reg_25; 138 __m128i m_temp_reg_30; 139 __m128i m_temp_reg_31; 140 __m128i m_temp_reg_32; 141 __m128i m_temp_reg_33; 142 __m128i m_temp_reg_34; 143 __m128i m_temp_reg_35; 144 __m128i m_temp_reg_36; 145 __m128i m_rdng_factor; 146 __m128i m_count; 147 148 __m128i m_ge_zero16b_flag_row0; 149 __m128i m_ge_zero16b_flag_row1; 150 __m128i m_ge_zero16b_flag_row2; 151 __m128i m_ge_zero16b_flag_row3; 152 153 __m128i m_zero = _mm_setzero_si128(); 154 155 WORD32 i4_shift = IT_SHIFT_STAGE_1; 156 UNUSED(zero_cols); 157 UNUSED(zero_rows); 158 UNUSED(pi2_tmp); 159 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 160 pi2_src += src_strd; 161 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 162 pi2_src += src_strd; 163 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 164 pi2_src += src_strd; 165 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 166 167 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0); 168 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1); 169 m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2); 170 m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3); 171 172 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0); 173 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1); 174 m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2); 175 m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3); 176 177 /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 178 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 179 180 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 181 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/ 182 183 /* c[4] in m_temp_reg_14 */ 184 /* c[4] = src[0] - src[2] + src[3] */ 185 { 186 m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2); 187 } 188 189 /* c[3] in m_temp_reg_13 */ 190 { 191 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6); 192 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3); 193 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1); 194 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 195 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 196 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); 197 } 198 199 /* c[0] in m_temp_reg_10 */ 200 { 201 m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2); 202 } 203 204 /* c[1] in m_temp_reg_11 */ 205 { 206 m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3); 207 } 208 209 /* c[2] in m_temp_reg_12 */ 210 { 211 m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3); 212 } 213 214 /* c[4] in m_temp_reg_14 */ 215 /* c[4] = src[0] - src[2] + src[3] */ 216 { 217 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3); 218 } 219 220 /* Stage 1 outputs stored in m_temp_reg_20-23 */ 221 { 222 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5); 223 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1); 224 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10); 225 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 226 //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0 227 228 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6); 229 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3); 230 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11); 231 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 232 //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1 233 234 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 235 236 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5); 237 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1); 238 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11); 239 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 240 //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1 241 242 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6); 243 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3); 244 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12); 245 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 246 //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2 247 248 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 249 250 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6); 251 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3); 252 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10); 253 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 254 //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0 255 256 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5); 257 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1); 258 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12); 259 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 260 //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2 261 262 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6); 263 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3); 264 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1); 265 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 266 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 267 //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4 268 269 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 270 m_count = _mm_cvtsi32_si128(i4_shift); 271 272 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 273 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13); 274 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 275 276 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 277 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 278 279 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 280 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13); 281 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 282 283 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 284 285 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 286 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 287 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 288 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 289 290 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 291 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 292 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 293 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 294 295 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 296 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 297 298 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 299 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 300 301 } 302 303 /* Stage 2 */ 304 { 305 i4_shift = IT_SHIFT_STAGE_2; 306 307 /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 308 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 309 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 310 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 311 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 312 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/ 313 314 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30); 315 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31); 316 317 m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 318 m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 319 m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 320 m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 321 322 323 /* c[4] stored in m_temp_reg_4 */ 324 { 325 m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 326 } 327 328 /* c[3] stored in m_temp_reg_3 */ 329 { 330 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6); 331 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3); 332 m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1); 333 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 334 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13); 335 //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); 336 } 337 338 /* c[0] stored in m_temp_reg_0 */ 339 { 340 m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 341 } 342 343 /* c[1] stored in m_temp_reg_1 */ 344 { 345 m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21); 346 } 347 348 /* c[2] stored in m_temp_reg_2 */ 349 { 350 m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23); 351 } 352 353 /* c[4] stored in m_temp_reg_4 */ 354 { 355 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23); 356 } 357 358 /* Stage 2 output generation */ 359 { 360 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5); 361 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1); 362 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0); 363 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 364 //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0 365 366 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6); 367 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3); 368 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1); 369 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 370 //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1 371 372 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 373 374 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5); 375 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1); 376 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1); 377 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 378 //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1 379 380 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6); 381 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3); 382 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2); 383 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 384 //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2 385 386 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 387 388 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6); 389 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3); 390 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0); 391 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 392 //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0 393 394 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5); 395 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1); 396 m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2); 397 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 398 //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2 399 400 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6); 401 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3); 402 m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1); 403 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 404 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13); 405 //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4 406 407 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 408 m_count = _mm_cvtsi32_si128(i4_shift); 409 410 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3); 411 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 412 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 413 414 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 415 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 416 417 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3); 418 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 419 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 420 421 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 422 423 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 424 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 425 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 426 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 427 428 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 429 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 430 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 431 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 432 433 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 434 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 435 436 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 437 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 438 } 439 440 /* Recon and store */ 441 { 442 WORD32 *pi4_dst = (WORD32 *)pu1_dst; 443 444 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 445 pu1_pred += pred_strd; 446 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 447 pu1_pred += pred_strd; 448 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 449 pu1_pred += pred_strd; 450 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 451 452 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero); 453 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero); 454 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero); 455 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero); 456 457 /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 458 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 459 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 460 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/ 461 462 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 463 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 464 465 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 466 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 467 468 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 469 470 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 471 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 472 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 473 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 474 pu1_dst += dst_strd; 475 pi4_dst = (WORD32 *)(pu1_dst); 476 477 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 478 pu1_dst += dst_strd; 479 pi4_dst = (WORD32 *)(pu1_dst); 480 481 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 482 pu1_dst += dst_strd; 483 pi4_dst = (WORD32 *)(pu1_dst); 484 485 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 486 } 487 } 488 } 489 490 491 /** 492 ******************************************************************************* 493 * 494 * @brief 495 * This function performs inverse quantization, inverse transform 496 * (DCT) and reconstruction for 4x4 input block 497 * 498 * @par Description: 499 * Performs inverse quantization , inverse transform and adds 500 * prediction data and clips output to 8 bit 501 * 502 * @param[in] pi2_src 503 * Input 4x4 coefficients 504 * 505 * @param[in] pi2_tmp 506 * Temporary 4x4 buffer for storing inverse 507 * transform 1st stage output 508 * 509 * @param[in] pu1_pred 510 * Prediction 4x4 block 511 * 512 * @param[in] pi2_dequant_coeff 513 * Dequant Coeffs 514 * 515 * @param[out] pu1_dst 516 * Output 4x4 block 517 * 518 * @param[in] qp_div 519 * Quantization parameter / 6 520 * 521 * @param[in] qp_rem 522 * Quantization parameter % 6 523 * 524 * @param[in] src_strd 525 * Input stride 526 * 527 * @param[in] pred_strd 528 * Prediction stride 529 * 530 * @param[in] dst_strd 531 * Output Stride 532 * 533 * @param[in] zero_cols 534 * Zero columns in pi2_src 535 * 536 * @returns Void 537 * 538 * @remarks 539 * None 540 * 541 ******************************************************************************* 542 */ 543 544 void ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src, 545 WORD16 *pi2_tmp, 546 UWORD8 *pu1_pred, 547 UWORD8 *pu1_dst, 548 WORD32 src_strd, 549 WORD32 pred_strd, 550 WORD32 dst_strd, 551 WORD32 zero_cols, 552 WORD32 zero_rows) 553 { 554 __m128i m_temp_reg_0; 555 __m128i m_temp_reg_1; 556 __m128i m_temp_reg_2; 557 __m128i m_temp_reg_3; 558 __m128i m_temp_reg_4; 559 __m128i m_temp_reg_10; 560 __m128i m_temp_reg_11; 561 __m128i m_temp_reg_12; 562 __m128i m_temp_reg_13; 563 __m128i m_temp_reg_14; 564 __m128i m_temp_reg_15; 565 __m128i m_temp_reg_20; 566 __m128i m_temp_reg_21; 567 __m128i m_temp_reg_22; 568 __m128i m_temp_reg_23; 569 __m128i m_temp_reg_24; 570 __m128i m_temp_reg_25; 571 __m128i m_temp_reg_30; 572 __m128i m_temp_reg_31; 573 __m128i m_temp_reg_33; 574 __m128i m_temp_reg_34; 575 __m128i m_rdng_factor; 576 __m128i m_count; 577 578 __m128i m_ge_zero16b_flag_row0; 579 __m128i m_ge_zero16b_flag_row1; 580 __m128i m_ge_zero16b_flag_row2; 581 __m128i m_ge_zero16b_flag_row3; 582 583 __m128i m_zero = _mm_setzero_si128(); 584 585 WORD32 i4_shift = IT_SHIFT_STAGE_1; 586 UNUSED(zero_rows); 587 UNUSED(zero_cols); 588 UNUSED(pi2_tmp); 589 590 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 591 pi2_src += src_strd; 592 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 593 pi2_src += src_strd; 594 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 595 pi2_src += src_strd; 596 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 597 598 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0); 599 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1); 600 m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2); 601 m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3); 602 603 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0); 604 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1); 605 m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2); 606 m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3); 607 608 /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 609 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 610 611 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 612 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/ 613 614 /* e */ 615 { 616 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6); 617 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6); 618 } 619 620 /* o */ 621 { 622 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5); 623 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2); 624 m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 625 //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36 626 627 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6); 628 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4); 629 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1); 630 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3); 631 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22); 632 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24); 633 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83 634 635 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6); 636 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4); 637 m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1); 638 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1); 639 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22); 640 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24); 641 //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83 642 643 m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5); 644 m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2); 645 m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 646 //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36 647 } 648 649 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 650 651 /* e1 stored in m_temp_reg_31 */ 652 { 653 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 654 } 655 656 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 657 658 /* e0 stored in m_temp_reg_30 */ 659 { 660 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 661 } 662 663 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 664 m_count = _mm_cvtsi32_si128(i4_shift); 665 666 /* o1 stored in m_temp_reg_33 */ 667 { 668 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 669 } 670 671 /* e1 + add */ 672 { 673 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 674 } 675 676 /* e0 + add */ 677 { 678 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 679 } 680 681 /* o0 stored in m_temp_reg_34 */ 682 { 683 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 684 } 685 686 /* Stage 1 outputs */ 687 { 688 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 689 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 690 691 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 692 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 693 694 695 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 696 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 697 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 698 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 699 700 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 701 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 702 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 703 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 704 705 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 706 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 707 708 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 709 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 710 } 711 712 /* Stage 2 */ 713 { 714 i4_shift = IT_SHIFT_STAGE_2; 715 716 /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 717 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/ 718 719 m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30); 720 m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31); 721 722 m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 723 m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 724 m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 725 m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 726 727 /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 728 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 729 730 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 731 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/ 732 733 /* e */ 734 { 735 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6); 736 } 737 738 /* o */ 739 /*{ 740 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36 741 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83 742 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83 743 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36 744 }*/ 745 { 746 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5); 747 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2); 748 m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1); 749 //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36 750 751 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6); 752 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4); 753 m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1); 754 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23); 755 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2); 756 m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4); 757 //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83 758 759 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6); 760 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4); 761 m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1); 762 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22); 763 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2); 764 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4); 765 //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83 766 767 m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5); 768 m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2); 769 m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1); 770 //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36 771 } 772 773 /* e */ 774 { 775 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6); 776 } 777 778 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 779 780 /* e1 stored in m_temp_reg_31 */ 781 { 782 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 783 } 784 785 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 786 787 /* e0 stored in m_temp_reg_30 */ 788 { 789 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 790 } 791 792 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 793 m_count = _mm_cvtsi32_si128(i4_shift); 794 795 /* o1 stored in m_temp_reg_33 */ 796 { 797 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 798 } 799 800 /* e1 + add */ 801 { 802 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 803 } 804 805 /* e0 + add */ 806 { 807 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 808 } 809 810 /* o0 stored in m_temp_reg_34 */ 811 { 812 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 813 } 814 815 /* Stage 2 outputs */ 816 { 817 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 818 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 819 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 820 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 821 822 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 823 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 824 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 825 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 826 827 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 828 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 829 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 830 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 831 832 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 833 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 834 835 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 836 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 837 } 838 839 /* Recon and store */ 840 { 841 UWORD32 *pu4_dst = (UWORD32 *)pu1_dst; 842 843 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 844 pu1_pred += pred_strd; 845 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 846 pu1_pred += pred_strd; 847 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 848 pu1_pred += pred_strd; 849 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 850 851 //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 852 //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 853 854 //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 855 //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3); 856 857 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero); 858 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero); 859 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero); 860 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero); 861 862 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 863 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 864 865 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 866 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 867 868 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 869 870 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 871 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 872 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 873 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 874 pu1_dst += dst_strd; 875 pu4_dst = (UWORD32 *)(pu1_dst); 876 877 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 878 pu1_dst += dst_strd; 879 pu4_dst = (UWORD32 *)(pu1_dst); 880 881 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 882 pu1_dst += dst_strd; 883 pu4_dst = (UWORD32 *)(pu1_dst); 884 885 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 886 } 887 } 888 } 889 890 891 892 /** 893 ******************************************************************************* 894 * 895 * @brief 896 * This function performs inverse quantization, inverse transform and 897 * reconstruction for 8c8 input block 898 * 899 * @par Description: 900 * Performs inverse quantization , inverse transform and adds the 901 * prediction data and clips output to 8 bit 902 * 903 * @param[in] pi2_src 904 * Input 8x8 coefficients 905 * 906 * @param[in] pi2_tmp 907 * Temporary 8x8 buffer for storing inverse 908 * transform 1st stage output 909 * 910 * @param[in] pu1_pred 911 * Prediction 8x8 block 912 * 913 * @param[in] pi2_dequant_coeff 914 * Dequant Coeffs 915 * 916 * @param[out] pu1_dst 917 * Output 8x8 block 918 * 919 * @param[in] src_strd 920 * Input stride 921 * 922 * @param[in] qp_div 923 * Quantization parameter / 6 924 * 925 * @param[in] qp_rem 926 * Quantization parameter % 6 927 * 928 * @param[in] pred_strd 929 * Prediction stride 930 * 931 * @param[in] dst_strd 932 * Output Stride 933 * 934 * @param[in] zero_cols 935 * Zero columns in pi2_src 936 * 937 * @returns Void 938 * 939 * @remarks 940 * None 941 * 942 ******************************************************************************* 943 */ 944 945 946 void ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src, 947 WORD16 *pi2_tmp, 948 UWORD8 *pu1_pred, 949 UWORD8 *pu1_dst, 950 WORD32 src_strd, 951 WORD32 pred_strd, 952 WORD32 dst_strd, 953 WORD32 zero_cols, 954 WORD32 zero_rows) 955 { 956 __m128i m_temp_reg_0; 957 __m128i m_temp_reg_1; 958 __m128i m_temp_reg_2; 959 __m128i m_temp_reg_3; 960 __m128i m_temp_reg_5; 961 __m128i m_temp_reg_6; 962 __m128i m_temp_reg_7; 963 __m128i m_temp_reg_4; 964 __m128i m_temp_reg_10; 965 __m128i m_temp_reg_11; 966 __m128i m_temp_reg_12; 967 __m128i m_temp_reg_13; 968 __m128i m_temp_reg_14; 969 __m128i m_temp_reg_15; 970 __m128i m_temp_reg_16; 971 __m128i m_temp_reg_17; 972 __m128i m_temp_reg_20; 973 __m128i m_temp_reg_21; 974 __m128i m_temp_reg_22; 975 __m128i m_temp_reg_23; 976 __m128i m_temp_reg_24; 977 __m128i m_temp_reg_25; 978 __m128i m_temp_reg_26; 979 __m128i m_temp_reg_27; 980 __m128i m_temp_reg_30; 981 __m128i m_temp_reg_31; 982 __m128i m_temp_reg_32; 983 __m128i m_temp_reg_33; 984 __m128i m_temp_reg_34; 985 __m128i m_temp_reg_35; 986 __m128i m_temp_reg_36; 987 __m128i m_temp_reg_37; 988 __m128i m_temp_reg_40; 989 __m128i m_temp_reg_41; 990 __m128i m_temp_reg_42; 991 __m128i m_temp_reg_43; 992 __m128i m_temp_reg_44; 993 __m128i m_temp_reg_45; 994 __m128i m_temp_reg_46; 995 __m128i m_temp_reg_47; 996 __m128i m_temp_reg_50; 997 __m128i m_temp_reg_51; 998 __m128i m_temp_reg_52; 999 __m128i m_temp_reg_53; 1000 __m128i m_temp_reg_54; 1001 __m128i m_temp_reg_55; 1002 __m128i m_temp_reg_56; 1003 __m128i m_temp_reg_57; 1004 __m128i m_temp_reg_60; 1005 __m128i m_temp_reg_61; 1006 __m128i m_temp_reg_62; 1007 __m128i m_temp_reg_63; 1008 __m128i m_temp_reg_64; 1009 __m128i m_temp_reg_65; 1010 __m128i m_temp_reg_66; 1011 __m128i m_temp_reg_67; 1012 __m128i m_temp_reg_70; 1013 __m128i m_temp_reg_71; 1014 __m128i m_temp_reg_72; 1015 __m128i m_temp_reg_73; 1016 __m128i m_temp_reg_74; 1017 __m128i m_temp_reg_75; 1018 __m128i m_temp_reg_76; 1019 __m128i m_temp_reg_77; 1020 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 1021 1022 WORD32 check_row_stage_1; /* Lokesh */ 1023 WORD32 check_row_stage_2; /* Lokesh */ 1024 1025 __m128i m_rdng_factor; 1026 //__m128i m_count; 1027 WORD32 i4_shift = IT_SHIFT_STAGE_1; 1028 UNUSED(zero_rows); 1029 UNUSED(zero_cols); 1030 UNUSED(pi2_tmp); 1031 1032 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0; 1033 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0; 1034 1035 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src); 1036 pi2_src += src_strd; 1037 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src); 1038 pi2_src += src_strd; 1039 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src); 1040 pi2_src += src_strd; 1041 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src); 1042 pi2_src += src_strd; 1043 1044 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src); 1045 pi2_src += src_strd; 1046 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src); 1047 pi2_src += src_strd; 1048 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src); 1049 pi2_src += src_strd; 1050 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src); 1051 1052 if(!check_row_stage_2) 1053 { 1054 if(!check_row_stage_1) 1055 { 1056 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1057 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1058 { 1059 //Interleaving 0,4 row in 0 , 1 Rishab 1060 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1061 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1062 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1063 1064 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1065 1066 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1067 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1068 1069 } 1070 1071 1072 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1073 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1074 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 1075 { 1076 1077 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1078 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1079 1080 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1081 //Interleaving 2,6 row in 4, 5 Rishab 1082 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1083 1084 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1085 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1086 1087 1088 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1089 1090 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1091 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 1092 1093 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1094 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 1095 1096 1097 1098 /* e */ 1099 1100 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1101 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1102 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1103 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1104 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1105 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1106 1107 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1108 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1109 1110 } 1111 1112 /* o */ 1113 { 1114 1115 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1116 { 1117 1118 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1119 //o0:1B*89+3B*75,5B*50+7B*18 1120 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1121 1122 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1123 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1124 1125 1126 1127 /* Column 0 of destination computed here */ 1128 /* It is stored in m_temp_reg_50 */ 1129 /* Column 7 of destination computed here */ 1130 /* It is stored in m_temp_reg_57 */ 1131 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 1132 1133 1134 1135 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1136 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1137 1138 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1139 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1140 1141 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1142 m_temp_reg_63 = _mm_setzero_si128(); 1143 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1144 1145 //o1:1B*75-3B*18,5B*89+7B*50 1146 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1147 1148 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1149 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1150 1151 /* Loading coeff for computing o2 in the next block */ 1152 1153 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1154 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 1155 1156 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1157 1158 1159 1160 /* Column 1 of destination computed here */ 1161 /* It is stored in m_temp_reg_51 */ 1162 /* Column 6 of destination computed here */ 1163 /* It is stored in m_temp_reg_56 */ 1164 1165 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1166 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1167 1168 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1169 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1170 1171 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1172 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1173 1174 //o2:1B*50-3B*89,5B*18+7B*75 1175 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1176 1177 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1178 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1179 1180 1181 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1182 1183 /* Loading coeff for computing o3 in the next block */ 1184 1185 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1186 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 1187 1188 1189 1190 /* Column 2 of destination computed here */ 1191 /* It is stored in m_temp_reg_52 */ 1192 /* Column 5 of destination computed here */ 1193 /* It is stored in m_temp_reg_55 */ 1194 1195 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1196 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1197 1198 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1199 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1200 1201 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1202 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1203 1204 //o3:1B*18-3B*50,5B*75-7B*89 1205 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1206 1207 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1208 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1209 1210 1211 1212 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1213 1214 1215 1216 /* Column 3 of destination computed here */ 1217 /* It is stored in m_temp_reg_53 */ 1218 /* Column 4 of destination computed here */ 1219 /* It is stored in m_temp_reg_54 */ 1220 1221 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1222 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1223 1224 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1225 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1226 1227 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1228 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1229 1230 1231 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1232 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1233 } 1234 } 1235 1236 /* Transpose of the destination 8x8 matrix done here */ 1237 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1238 /* respectively */ 1239 { 1240 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1241 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1242 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1243 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1244 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1245 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1246 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1247 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1248 1249 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1250 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1251 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1252 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1253 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1254 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1255 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1256 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1257 1258 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1259 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1260 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1261 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1262 1263 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1264 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1265 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1266 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1267 */ 1268 m_temp_reg_54 = _mm_setzero_si128(); 1269 m_temp_reg_55 = _mm_setzero_si128(); 1270 m_temp_reg_56 = _mm_setzero_si128(); 1271 m_temp_reg_57 = _mm_setzero_si128(); 1272 } 1273 } 1274 else 1275 { 1276 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1277 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1278 { 1279 //Interleaving 0,4 row in 0 , 1 Rishab 1280 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1281 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1282 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1283 1284 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1285 1286 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1287 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1288 1289 } 1290 1291 1292 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1293 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1294 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 1295 { 1296 1297 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1298 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1299 1300 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1301 //Interleaving 2,6 row in 4, 5 Rishab 1302 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1303 1304 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1305 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1306 1307 1308 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1309 1310 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1311 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 1312 1313 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1314 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 1315 1316 1317 1318 /* e */ 1319 1320 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1321 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1322 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1323 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1324 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1325 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1326 1327 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1328 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1329 1330 } 1331 1332 /* o */ 1333 { 1334 1335 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1336 { 1337 1338 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1339 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1340 //o0:1B*89+3B*75,5B*50+7B*18 1341 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1342 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1343 1344 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1345 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1346 1347 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1348 1349 1350 1351 /* Column 0 of destination computed here */ 1352 /* It is stored in m_temp_reg_50 */ 1353 /* Column 7 of destination computed here */ 1354 /* It is stored in m_temp_reg_57 */ 1355 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 1356 1357 1358 1359 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1360 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1361 1362 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1363 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1364 1365 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1366 m_temp_reg_63 = _mm_setzero_si128(); 1367 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1368 1369 //o1:1B*75-3B*18,5B*89+7B*50 1370 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1371 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1372 1373 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1374 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1375 1376 /* Loading coeff for computing o2 in the next block */ 1377 1378 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1379 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 1380 1381 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1382 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 1383 1384 1385 1386 /* Column 1 of destination computed here */ 1387 /* It is stored in m_temp_reg_51 */ 1388 /* Column 6 of destination computed here */ 1389 /* It is stored in m_temp_reg_56 */ 1390 1391 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1392 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1393 1394 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1395 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1396 1397 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1398 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1399 1400 //o2:1B*50-3B*89,5B*18+7B*75 1401 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1402 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1403 1404 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1405 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1406 1407 1408 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1409 1410 /* Loading coeff for computing o3 in the next block */ 1411 1412 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1413 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 1414 1415 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1416 1417 1418 /* Column 2 of destination computed here */ 1419 /* It is stored in m_temp_reg_52 */ 1420 /* Column 5 of destination computed here */ 1421 /* It is stored in m_temp_reg_55 */ 1422 1423 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1424 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1425 1426 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1427 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1428 1429 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1430 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1431 1432 //o3:1B*18-3B*50,5B*75-7B*89 1433 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1434 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1435 1436 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1437 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1438 1439 1440 1441 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1442 1443 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 1444 1445 1446 /* Column 3 of destination computed here */ 1447 /* It is stored in m_temp_reg_53 */ 1448 /* Column 4 of destination computed here */ 1449 /* It is stored in m_temp_reg_54 */ 1450 1451 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1452 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1453 1454 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1455 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1456 1457 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1458 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1459 1460 1461 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1462 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1463 } 1464 } 1465 1466 /* Transpose of the destination 8x8 matrix done here */ 1467 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1468 /* respectively */ 1469 { 1470 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1471 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1472 //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1473 //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1474 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1475 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1476 //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1477 //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1478 1479 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1480 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1481 //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1482 //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1483 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1484 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1485 //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1486 //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1487 1488 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1489 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1490 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1491 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1492 1493 /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1494 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1495 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1496 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1497 */ 1498 m_temp_reg_54 = _mm_setzero_si128(); 1499 m_temp_reg_55 = _mm_setzero_si128(); 1500 m_temp_reg_56 = _mm_setzero_si128(); 1501 m_temp_reg_57 = _mm_setzero_si128(); 1502 } 1503 } 1504 1505 /* Stage 2 */ 1506 i4_shift = IT_SHIFT_STAGE_2; 1507 { 1508 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1509 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1510 { 1511 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 1512 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 1513 1514 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 1515 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 1516 1517 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1518 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1519 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1520 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1521 1522 1523 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 1524 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 1525 } 1526 1527 1528 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1529 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1530 { 1531 1532 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 1533 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 1534 1535 1536 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1537 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1538 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1539 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1540 1541 /* Loading coeff for computing o0 in the next block */ 1542 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1543 1544 1545 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 1546 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 1547 1548 1549 1550 /* e */ 1551 1552 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1553 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1554 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1555 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1556 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1557 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1558 1559 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1560 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1561 1562 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1563 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1564 1565 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1566 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1567 1568 } 1569 1570 /* o */ 1571 { 1572 1573 //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57); 1574 //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57); 1575 1576 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1577 { 1578 //o0:1B*89+3B*75,1T*89+3T*75 1579 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1580 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1581 1582 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1583 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1584 /* Loading coeff for computing o1 in the next block */ 1585 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1586 1587 1588 1589 /* Column 0 of destination computed here */ 1590 /* It is stored in m_temp_reg_50 */ 1591 /* Column 7 of destination computed here */ 1592 /* It is stored in m_temp_reg_57 */ 1593 1594 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1595 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1596 1597 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1598 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1599 1600 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1601 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1602 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1603 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1604 1605 //o1:1B*75-3B*18,1T*75-3T*18 1606 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1607 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1608 1609 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1610 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1611 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1612 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1613 1614 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1615 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1616 1617 1618 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1619 1620 1621 /* Loading coeff for computing o2 in the next block */ 1622 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1623 1624 1625 1626 /* Column 1 of destination computed here */ 1627 /* It is stored in m_temp_reg_51 */ 1628 /* Column 6 of destination computed here */ 1629 /* It is stored in m_temp_reg_56 */ 1630 1631 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1632 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1633 1634 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1635 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1636 1637 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1638 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1639 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1640 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1641 1642 //o2:1B*50-3B*89,5T*18+7T*75. 1643 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1644 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1645 1646 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1647 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1648 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1649 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1650 1651 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1652 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1653 1654 1655 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1656 1657 /* Loading coeff for computing o3 in the next block */ 1658 1659 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1660 1661 1662 /* Column 2 of destination computed here */ 1663 /* It is stored in m_temp_reg_52 */ 1664 /* Column 5 of destination computed here */ 1665 /* It is stored in m_temp_reg_55 */ 1666 1667 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1668 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1669 1670 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1671 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1672 1673 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1674 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1675 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1676 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1677 1678 //o3:1B*18-3B*50,1T*18-3T*50 1679 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1680 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1681 1682 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1683 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1684 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1685 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1686 1687 1688 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1689 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1690 1691 1692 1693 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1694 1695 1696 /* Column 3 of destination computed here */ 1697 /* It is stored in m_temp_reg_53 */ 1698 /* Column 4 of destination computed here */ 1699 /* It is stored in m_temp_reg_54 */ 1700 1701 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1702 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1703 1704 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1705 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1706 1707 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 1708 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 1709 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 1710 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 1711 1712 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 1713 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 1714 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 1715 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 1716 1717 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 1718 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 1719 } 1720 } 1721 1722 /* Transpose of the destination 8x8 matrix done here */ 1723 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1724 /* respectively */ 1725 { 1726 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1727 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1728 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1729 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1730 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1731 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1732 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1733 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1734 1735 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1736 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1737 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1738 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1739 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1740 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1741 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1742 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1743 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1744 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1745 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1746 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1747 1748 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1749 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1750 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1751 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1752 } 1753 1754 /* Recon and store */ 1755 { 1756 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 1757 pu1_pred += pred_strd; 1758 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 1759 pu1_pred += pred_strd; 1760 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 1761 pu1_pred += pred_strd; 1762 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 1763 pu1_pred += pred_strd; 1764 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 1765 pu1_pred += pred_strd; 1766 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 1767 pu1_pred += pred_strd; 1768 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 1769 pu1_pred += pred_strd; 1770 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 1771 1772 m_temp_reg_50 = _mm_setzero_si128(); 1773 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 1774 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 1775 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 1776 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 1777 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 1778 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 1779 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 1780 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 1781 1782 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 1783 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 1784 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 1785 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 1786 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 1787 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 1788 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 1789 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 1790 1791 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 1792 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 1793 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 1794 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 1795 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 1796 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 1797 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 1798 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 1799 1800 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 1801 pu1_dst += dst_strd; 1802 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 1803 pu1_dst += dst_strd; 1804 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 1805 pu1_dst += dst_strd; 1806 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 1807 pu1_dst += dst_strd; 1808 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 1809 pu1_dst += dst_strd; 1810 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 1811 pu1_dst += dst_strd; 1812 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 1813 pu1_dst += dst_strd; 1814 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 1815 pu1_dst += dst_strd; 1816 } 1817 } 1818 } 1819 else 1820 1821 { 1822 1823 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1824 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1825 if(!check_row_stage_1) 1826 { 1827 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1828 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1829 { 1830 //Interleaving 0,4 row in 0 , 1 Rishab 1831 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1832 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1833 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1834 1835 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1836 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 1837 1838 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1839 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1840 1841 1842 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1843 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1844 } 1845 1846 1847 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1848 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1849 { 1850 1851 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1852 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1853 1854 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1855 //Interleaving 2,6 row in 4, 5 Rishab 1856 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1857 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 1858 1859 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1860 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1861 1862 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 1863 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1864 1865 1866 1867 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1868 1869 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1870 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]); 1871 1872 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1873 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]); 1874 1875 } 1876 1877 /* e */ 1878 { 1879 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1880 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1881 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1882 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1883 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1884 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1885 1886 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1887 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1888 1889 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1890 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1891 1892 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1893 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1894 1895 } 1896 1897 /* o */ 1898 { 1899 1900 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1901 { 1902 1903 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1904 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1905 // m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75,m_temp_reg_77); 1906 // m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75,m_temp_reg_77); 1907 //o0:1B*89+3B*75,1T*89+3T*75 1908 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1909 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1910 //m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1911 //m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 1912 1913 1914 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1915 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1916 1917 //m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1918 //m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1919 } 1920 1921 /* Column 0 of destination computed here */ 1922 /* It is stored in m_temp_reg_50 */ 1923 /* Column 7 of destination computed here */ 1924 /* It is stored in m_temp_reg_57 */ 1925 { 1926 1927 1928 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1929 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1930 1931 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1932 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1933 1934 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1935 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1936 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1937 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1938 1939 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1940 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1941 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1942 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1943 1944 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1945 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1946 //m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1947 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1948 //m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 1949 1950 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1951 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1952 1953 /* Loading coeff for computing o2 in the next block */ 1954 1955 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1956 1957 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1958 //m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 1959 //m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 1960 } 1961 1962 /* Column 1 of destination computed here */ 1963 /* It is stored in m_temp_reg_51 */ 1964 /* Column 6 of destination computed here */ 1965 /* It is stored in m_temp_reg_56 */ 1966 { 1967 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1968 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1969 1970 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1971 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1972 1973 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1974 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1975 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1976 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1977 1978 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1979 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1980 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1981 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1982 1983 //o2:1B*50-3B*89,1T*50-3T*89 1984 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1985 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1986 1987 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1988 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1989 1990 1991 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1992 1993 1994 /* Loading coeff for computing o3 in the next block */ 1995 1996 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1997 1998 //m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1999 //m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2000 } 2001 2002 /* Column 2 of destination computed here */ 2003 /* It is stored in m_temp_reg_52 */ 2004 /* Column 5 of destination computed here */ 2005 /* It is stored in m_temp_reg_55 */ 2006 { 2007 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 2008 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 2009 2010 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 2011 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 2012 2013 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2014 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2015 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2016 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2017 2018 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2019 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2020 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2021 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2022 2023 //o3:1B*18-3B*50,1T*18-3T*50 2024 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 2025 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 2026 2027 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2028 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2029 2030 2031 2032 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 2033 2034 2035 } 2036 2037 /* Column 3 of destination computed here */ 2038 /* It is stored in m_temp_reg_53 */ 2039 /* Column 4 of destination computed here */ 2040 /* It is stored in m_temp_reg_54 */ 2041 { 2042 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 2043 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 2044 2045 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 2046 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 2047 2048 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2049 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2050 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2051 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2052 2053 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2054 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2055 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2056 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2057 2058 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2059 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2060 } 2061 } 2062 2063 /* Transpose of the destination 8x8 matrix done here */ 2064 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 2065 /* respectively */ 2066 { 2067 2068 2069 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 2070 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 2071 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 2072 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 2073 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2074 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 2075 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 2076 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 2077 2078 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 2079 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 2080 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 2081 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 2082 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2083 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 2084 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 2085 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 2086 2087 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 2088 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 2089 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 2090 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 2091 2092 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 2093 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 2094 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 2095 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 2096 } 2097 } 2098 else 2099 { 2100 2101 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 2102 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 2103 { 2104 //Interleaving 0,4 row in 0 , 1 Rishab 2105 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 2106 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 2107 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 2108 2109 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 2110 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 2111 2112 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2113 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2114 2115 2116 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2117 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2118 } 2119 2120 2121 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 2122 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 2123 { 2124 2125 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 2126 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 2127 2128 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 2129 //Interleaving 2,6 row in 4, 5 Rishab 2130 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 2131 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 2132 2133 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 2134 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 2135 2136 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 2137 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 2138 2139 2140 2141 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 2142 2143 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 2144 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 2145 2146 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 2147 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 2148 2149 } 2150 2151 /* e */ 2152 { 2153 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 2154 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 2155 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 2156 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 2157 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 2158 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 2159 2160 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 2161 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 2162 2163 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 2164 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 2165 2166 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 2167 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 2168 2169 } 2170 2171 /* o */ 2172 { 2173 2174 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 2175 { 2176 2177 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 2178 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 2179 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 2180 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 2181 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 2182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 2183 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 2184 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 2185 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 2186 2187 2188 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2189 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 2190 2191 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2192 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2193 } 2194 2195 /* Column 0 of destination computed here */ 2196 /* It is stored in m_temp_reg_50 */ 2197 /* Column 7 of destination computed here */ 2198 /* It is stored in m_temp_reg_57 */ 2199 { 2200 2201 2202 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 2203 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 2204 2205 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2206 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2207 2208 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2209 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2210 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2211 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2212 2213 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2214 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2215 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2216 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2217 2218 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 2219 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 2220 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 2221 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 2222 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 2223 2224 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2225 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2226 2227 /* Loading coeff for computing o2 in the next block */ 2228 2229 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 2230 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 2231 2232 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 2233 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 2234 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 2235 } 2236 2237 /* Column 1 of destination computed here */ 2238 /* It is stored in m_temp_reg_51 */ 2239 /* Column 6 of destination computed here */ 2240 /* It is stored in m_temp_reg_56 */ 2241 { 2242 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 2243 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 2244 2245 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 2246 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 2247 2248 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2249 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2250 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2251 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2252 2253 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2254 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2255 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2256 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2257 2258 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 2259 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 2260 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 2261 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 2262 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 2263 2264 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2265 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2266 2267 2268 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 2269 2270 2271 /* Loading coeff for computing o3 in the next block */ 2272 2273 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 2274 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 2275 2276 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2277 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2278 } 2279 2280 /* Column 2 of destination computed here */ 2281 /* It is stored in m_temp_reg_52 */ 2282 /* Column 5 of destination computed here */ 2283 /* It is stored in m_temp_reg_55 */ 2284 { 2285 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 2286 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 2287 2288 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 2289 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 2290 2291 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2292 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2293 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2294 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2295 2296 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2297 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2298 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2299 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2300 2301 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 2302 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 2303 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 2304 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 2305 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 2306 2307 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2308 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2309 2310 2311 2312 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 2313 2314 2315 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 2316 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 2317 } 2318 2319 /* Column 3 of destination computed here */ 2320 /* It is stored in m_temp_reg_53 */ 2321 /* Column 4 of destination computed here */ 2322 /* It is stored in m_temp_reg_54 */ 2323 { 2324 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 2325 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 2326 2327 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 2328 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 2329 2330 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2331 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2332 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2333 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2334 2335 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2336 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2337 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2338 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2339 2340 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2341 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2342 } 2343 } 2344 2345 /* Transpose of the destination 8x8 matrix done here */ 2346 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 2347 /* respectively */ 2348 { 2349 2350 2351 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 2352 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 2353 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 2354 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 2355 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2356 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 2357 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 2358 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 2359 2360 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 2361 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 2362 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 2363 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 2364 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2365 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 2366 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 2367 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 2368 2369 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 2370 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 2371 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 2372 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 2373 2374 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 2375 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 2376 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 2377 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 2378 } 2379 } 2380 /* Stage 2 */ 2381 2382 i4_shift = IT_SHIFT_STAGE_2; 2383 2384 { 2385 2386 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 2387 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 2388 { 2389 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 2390 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 2391 2392 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 2393 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 2394 2395 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2396 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2397 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2398 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2399 2400 2401 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 2402 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 2403 } 2404 2405 2406 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 2407 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 2408 { 2409 //m_temp_reg_66 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); 2410 //m_temp_reg_64 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); 2411 //m_temp_reg_62 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); 2412 //m_temp_reg_60 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); 2413 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 2414 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 2415 2416 2417 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2418 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2419 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2420 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2421 2422 //m_temp_reg_16 = _mm_sub_epi32(m_temp_reg_64, m_temp_reg_66); 2423 //m_temp_reg_14 = _mm_add_epi32(m_temp_reg_60, m_temp_reg_62); 2424 2425 /* Loading coeff for computing o0 in the next block */ 2426 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 2427 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 2428 2429 2430 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 2431 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 2432 /*m_temp_reg_3 = _mm_srli_si128(m_temp_reg_53, 8); 2433 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 2434 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3); 2435 */ 2436 2437 } 2438 2439 /* e */ 2440 { 2441 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 2442 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 2443 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 2444 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 2445 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 2446 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 2447 2448 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 2449 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 2450 2451 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 2452 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 2453 2454 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 2455 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 2456 2457 } 2458 2459 /* o */ 2460 { 2461 /* m_temp_reg_4 = _mm_cvtepi16_epi32(m_temp_reg_55); 2462 m_temp_reg_5 = _mm_srli_si128(m_temp_reg_55, 8); 2463 m_temp_reg_6 = _mm_cvtepi16_epi32(m_temp_reg_57); 2464 m_temp_reg_7 = _mm_srli_si128(m_temp_reg_57, 8); 2465 m_temp_reg_5 = _mm_cvtepi16_epi32(m_temp_reg_5); 2466 m_temp_reg_7 = _mm_cvtepi16_epi32(m_temp_reg_7); 2467 */ 2468 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); 2469 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); 2470 2471 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 2472 { 2473 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 2474 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2475 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2476 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 2477 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 2478 2479 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2480 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 2481 /* Loading coeff for computing o1 in the next block */ 2482 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 2483 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 2484 2485 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2486 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2487 } 2488 2489 /* Column 0 of destination computed here */ 2490 /* It is stored in m_temp_reg_50 */ 2491 /* Column 7 of destination computed here */ 2492 /* It is stored in m_temp_reg_57 */ 2493 { 2494 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 2495 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 2496 2497 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2498 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2499 2500 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2501 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2502 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2503 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2504 2505 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2506 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2507 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2508 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2509 2510 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 2511 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2512 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 2513 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2514 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 2515 2516 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2517 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2518 2519 2520 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 2521 2522 2523 /* Loading coeff for computing o2 in the next block */ 2524 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 2525 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 2526 2527 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 2528 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 2529 } 2530 2531 /* Column 1 of destination computed here */ 2532 /* It is stored in m_temp_reg_51 */ 2533 /* Column 6 of destination computed here */ 2534 /* It is stored in m_temp_reg_56 */ 2535 { 2536 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 2537 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 2538 2539 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 2540 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 2541 2542 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2543 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2544 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2545 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2546 2547 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2548 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2549 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2550 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2551 2552 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 2553 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2554 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 2555 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2556 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 2557 2558 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2559 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2560 2561 2562 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 2563 2564 /* Loading coeff for computing o3 in the next block */ 2565 2566 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 2567 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 2568 2569 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2570 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2571 } 2572 2573 /* Column 2 of destination computed here */ 2574 /* It is stored in m_temp_reg_52 */ 2575 /* Column 5 of destination computed here */ 2576 /* It is stored in m_temp_reg_55 */ 2577 { 2578 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 2579 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 2580 2581 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 2582 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 2583 2584 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2585 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2586 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2587 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2588 2589 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2590 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2591 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2592 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2593 2594 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 2595 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2596 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 2597 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2598 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 2599 2600 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2601 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2602 2603 2604 2605 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 2606 2607 2608 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 2609 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 2610 } 2611 2612 /* Column 3 of destination computed here */ 2613 /* It is stored in m_temp_reg_53 */ 2614 /* Column 4 of destination computed here */ 2615 /* It is stored in m_temp_reg_54 */ 2616 { 2617 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 2618 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 2619 2620 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 2621 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 2622 2623 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 2624 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 2625 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 2626 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 2627 2628 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 2629 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 2630 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 2631 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 2632 2633 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 2634 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 2635 } 2636 } 2637 2638 /* Transpose of the destination 8x8 matrix done here */ 2639 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 2640 /* respectively */ 2641 { 2642 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 2643 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 2644 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 2645 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 2646 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2647 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 2648 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 2649 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 2650 2651 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 2652 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 2653 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 2654 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 2655 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2656 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 2657 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 2658 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 2659 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 2660 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 2661 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 2662 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 2663 2664 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 2665 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 2666 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 2667 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 2668 } 2669 2670 /* Recon and store */ 2671 { 2672 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 2673 pu1_pred += pred_strd; 2674 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 2675 pu1_pred += pred_strd; 2676 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 2677 pu1_pred += pred_strd; 2678 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 2679 pu1_pred += pred_strd; 2680 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 2681 pu1_pred += pred_strd; 2682 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 2683 pu1_pred += pred_strd; 2684 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 2685 pu1_pred += pred_strd; 2686 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 2687 2688 2689 m_temp_reg_50 = _mm_setzero_si128(); 2690 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 2691 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 2692 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 2693 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 2694 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 2695 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 2696 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 2697 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 2698 2699 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 2700 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 2701 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 2702 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 2703 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 2704 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 2705 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 2706 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 2707 2708 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 2709 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 2710 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 2711 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 2712 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 2713 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 2714 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 2715 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 2716 2717 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 2718 pu1_dst += dst_strd; 2719 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 2720 pu1_dst += dst_strd; 2721 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 2722 pu1_dst += dst_strd; 2723 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 2724 pu1_dst += dst_strd; 2725 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 2726 pu1_dst += dst_strd; 2727 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 2728 pu1_dst += dst_strd; 2729 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 2730 pu1_dst += dst_strd; 2731 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 2732 pu1_dst += dst_strd; 2733 2734 } 2735 2736 2737 } 2738 2739 2740 } 2741 } 2742 2743