/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ih264e_intra_modes_eval_ssse3.c
 *
 * @brief
 *  This file contains definitions of routines that perform rate distortion
 *  analysis on a macroblock if it is to be coded as intra.
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  ih264e_evaluate_intra16x16_modes_ssse3
 *  ih264e_evaluate_intra_4x4_modes_ssse3
 *  ih264e_evaluate_intra_chroma_modes_ssse3
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <immintrin.h>

/* User include files */
#include "ih264e_config.h"
#include "ih264_typedefs.h"
#include "ih264e_defs.h"
#include "iv2.h"
#include "ive2.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_macros.h"
#include "ih264_intra_pred_filters.h"
#include "ih264_structs.h"
#include "ih264_common_tables.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_mem_fns.h"
#include "ih264_padding.h"
#include "ih264_deblk_edge_filters.h"
#include "ime_distortion_metrics.h"
#include "ih264e_error.h"
#include "ih264e_bitstream.h"
#include "ime_defs.h"
#include "ime_structs.h"
#include "ih264_cabac_tables.h"
#include "irc_cntrl_param.h"
#include "irc_frame_info_collector.h"
#include "ih264e_rate_control.h"

#include "ih264e_cabac_structs.h"
#include "ih264e_structs.h"
#include "ih264e_cabac.h"
#include "ih264e_intra_modes_eval.h"
#include "ih264e_globals.h"
#include "ime_platform_macros.h"
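
/*
 * Illustrative note (not part of the original file): the _ssse3 routines
 * below are signature-compatible with the generic C evaluators and are meant
 * to be plugged in when the host CPU supports SSSE3. A minimal dispatch
 * sketch, with hypothetical member and helper names:
 *
 *     if(cpu_supports_ssse3())
 *         ps_codec->pf_evaluate_intra16x16_modes =
 *                         ih264e_evaluate_intra16x16_modes_ssse3;
 */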

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
/**
 ******************************************************************************
 *
 * @brief
 *  evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
 *  prediction.
 *
 * @par Description
 *  This function evaluates the first three 16x16 modes, computes the
 *  corresponding SADs and returns the buffer predicted with the best mode.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_ngbr_pels_i16
 *  UWORD8 pointer to neighbouring pels
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] n_avblty
 *  availability of neighbouring pixels
 *
 * @param[out] u4_intra_mode
 *  pointer to the variable in which the best mode is returned
 *
 * @param[in,out] pu4_sadmin
 *  pointer to the variable in which the minimum SAD is returned
 *
 * @param[in] u4_valid_intra_modes
 *  flags indicating which intra modes are valid
 *
 * @return
 *  None
 *
 ******************************************************************************
 */
void ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 *pu1_src,
                                            UWORD8 *pu1_ngbr_pels_i16,
                                            UWORD8 *pu1_dst,
                                            UWORD32 src_strd,
                                            UWORD32 dst_strd,
                                            WORD32 n_avblty,
                                            UWORD32 *u4_intra_mode,
                                            WORD32 *pu4_sadmin,
                                            UWORD32 u4_valid_intra_modes)
{
    UWORD8 *pu1_src_temp;

    WORD32 left, top, horz_flag, vert_flag, dc_flag;
    WORD32 sad_vert, sad_horz, sad_dc, min_sad;

    WORD32 cnt, dcval;
    WORD32 src_strd2, src_strd3, src_strd4;
    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i val1_16x8b, val2_16x8b, val3_16x8b, val4_16x8b;
    __m128i sad1_8x16b, sad2_8x16b, sad3_8x16b, sad4_8x16b;

    __m128i sad_8x16b, val_16x8b, zero_vector;

    sad_vert = INT_MAX;
    sad_horz = INT_MAX;
    sad_dc = INT_MAX;

    src_strd2 = src_strd << 1;
    src_strd4 = src_strd << 2;
    src_strd3 = src_strd + src_strd2;

    dst_strd2 = dst_strd << 1;
    dst_strd4 = dst_strd << 2;
    dst_strd3 = dst_strd + dst_strd2;

    left = (n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    zero_vector = _mm_setzero_si128();

    horz_flag = left && ((u4_valid_intra_modes & 02) != 0);
    vert_flag = top && ((u4_valid_intra_modes & 01) != 0);
    dc_flag = (u4_valid_intra_modes & 04) != 0;
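
    /*
     * SAD accumulation scheme used throughout this function (illustrative
     * note): _mm_sad_epu8 leaves two 16-bit sums of eight absolute
     * differences in the low words of its two 64-bit lanes. Four such row
     * results are narrowed with _mm_packs_epi32 into one vector of 16-bit
     * partial sums, accumulated with _mm_add_epi16 across row groups, and
     * finally collapsed to a single total:
     *
     *     sad = _mm_hadd_epi16(sad, sad);     // 8 -> 4 partial sums
     *     sad = _mm_hadd_epi16(sad, sad);     // 4 -> 2 partial sums
     *     sad = _mm_hadd_epi16(sad, sad);     // 2 -> 1 total
     *     total = _mm_extract_epi16(sad, 0);  // 16x16 SAD <= 65280, fits
     */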

    if(horz_flag)
    {
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[15]);
        val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[14]);
        val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[13]);
        val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[12]);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
            val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
            val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
            val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_horz = _mm_extract_epi16(sad_8x16b, 0);
    }

    if(vert_flag)
    {
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_vert = _mm_extract_epi16(sad_8x16b, 0);
    }

    dcval = 0;

    if(left)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels_i16);
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    if(top)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    dcval = dcval >> (3 + left + top);
    dcval += ((left == 0) & (top == 0)) << 7;
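
    /*
     * dcval now matches the H.264 I_16x16 DC predictor: with both neighbours
     * available it is (sum(left) + sum(top) + 16) >> 5 (each 'dcval += 8'
     * above contributes half of the rounding term), with one neighbour it is
     * (sum + 8) >> 4, and with none it falls back to 128 for 8-bit content.
     */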

    if(dc_flag)
    {
        pu1_src_temp = pu1_src;
        val1_16x8b = _mm_set1_epi8(dcval);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 12;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt > 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_dc = _mm_extract_epi16(sad_8x16b, 0);
    }

    // Doing prediction for the mode with minimum SAD
    min_sad = MIN3(sad_horz, sad_vert, sad_dc);
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;
        if(min_sad == sad_vert)
        {
            *u4_intra_mode = VERT_I16x16;
            val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
        else if(min_sad == sad_horz)
        {
            *u4_intra_mode = HORZ_I16x16;
            cnt = 15;
            do
            {
                val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
                val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
                val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
                val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val2_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val3_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val4_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt >= 0);
        }
        else
        {
            *u4_intra_mode = DC_I16x16;
            val1_16x8b = _mm_set1_epi8(dcval);
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
    }
}
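
/*
 * Usage sketch (illustrative, with hypothetical buffer names): the evaluator
 * updates *pu4_sadmin and writes the 16x16 prediction only when the best of
 * the enabled {VERT, HORZ, DC} modes beats the SAD already passed in.
 *
 *     UWORD32 u4_mode;
 *     WORD32 i4_sad_min = INT_MAX;
 *
 *     ih264e_evaluate_intra16x16_modes_ssse3(pu1_src, pu1_ngbr_pels_i16,
 *                                            pu1_pred, src_strd, pred_strd,
 *                                            n_avblty, &u4_mode, &i4_sad_min,
 *                                            07);
 *
 * On return, u4_mode holds the winning mode and pu1_pred its prediction,
 * but only if i4_sad_min improved.
 */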

/**
 ******************************************************************************
 *
 * @brief
 *  Evaluate best intra 4x4 mode and do the prediction.
 *
 * @par Description
 *  This function evaluates the intra 4x4 modes, computes the corresponding
 *  costs and returns the buffer predicted with the best mode.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_ngbr_pels
 *  UWORD8 pointer to neighbouring pels
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] u4_n_avblty
 *  availability of neighbouring pixels
 *
 * @param[out] u4_intra_mode
 *  pointer to the variable in which the best mode is returned
 *
 * @param[out] pu4_sadmin
 *  pointer to the variable in which the minimum cost is returned
 *
 * @param[in] u4_valid_intra_modes
 *  flags indicating which intra modes are valid
 *
 * @param[in] u4_lambda
 *  lambda value for computing cost from SAD
 *
 * @param[in] u4_predictd_mode
 *  predicted mode used for cost computation
 *
 * @return none
 *
 ******************************************************************************
 */
void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_ngbr_pels,
                                           UWORD8 *pu1_dst,
                                           UWORD32 src_strd,
                                           UWORD32 dst_strd,
                                           WORD32 u4_n_avblty,
                                           UWORD32 *u4_intra_mode,
                                           WORD32 *pu4_sadmin,
                                           UWORD32 u4_valid_intra_modes,
                                           UWORD32 u4_lambda,
                                           UWORD32 u4_predictd_mode)
{
    WORD32 left, top;
    WORD32 sad[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                             INT_MAX, INT_MAX, INT_MAX, INT_MAX };
    WORD32 cost[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                              INT_MAX, INT_MAX, INT_MAX, INT_MAX };

    WORD32 min_cost;
    /* mode-bits penalty: the predicted mode costs ~1 bit to signal,
       any other mode ~4 bits, hence u4_lambda vs 4 * u4_lambda below */
    UWORD32 lambda4 = u4_lambda << 2;
    WORD32 dst_strd2, dst_strd3;

    __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
    __m128i pred1_16x8b, pred2_16x8b, pred3_16x8b, pred4_16x8b;
    __m128i pred5_16x8b, pred6_16x8b, pred7_16x8b, pred8_16x8b;
    __m128i shuffle_16x8b, zero_vector, mask_low_32b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    // loading the 4x4 source block and neighbouring pixels
    {
        __m128i row1_16x8b, row2_16x8b;

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        left_top_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        pu1_src += src_strd << 1;
        src_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        zero_vector = _mm_setzero_si128();

        row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
        src_16x8b = _mm_unpacklo_epi64(src_16x8b, row1_16x8b);
    }
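
    /*
     * src_16x8b now holds the whole 4x4 block row-major: bytes 0..3 are
     * row 0, 4..7 row 1, 8..11 row 2, 12..15 row 3. Every candidate
     * prediction below is assembled in the same layout, so a single
     * _mm_sad_epu8 per mode yields the SAD of the full 4x4 block.
     */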

    /* Computing SADs */
    if(u4_valid_intra_modes & 1)/* VERT mode valid? */
    {
        pred0_16x8b = _mm_srli_si128(left_top_16x8b, 5);
        pred0_16x8b = _mm_shuffle_epi32(pred0_16x8b, 0);
        sad_8x16b = _mm_sad_epu8(src_16x8b, pred0_16x8b);

        sad[VERT_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[VERT_I4x4] = sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes & 2)/* HORZ mode valid? */
    {
        shuffle_16x8b = _mm_setr_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
        pred1_16x8b = _mm_shuffle_epi8(left_top_16x8b, shuffle_16x8b);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred1_16x8b);

        sad[HORZ_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[HORZ_I4x4] = sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes & 4)/* DC mode valid? */
    {
        if(top + left)
        {
            WORD32 shft = 1, dcval = 0;

            __m128i val_16x8b, temp_16x8b, temp_8x16b;

            val_16x8b = _mm_setzero_si128();

            if(top)
            {
                temp_16x8b = _mm_srli_si128(left_top_16x8b, 5);
                val_16x8b = _mm_alignr_epi8(temp_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }
            if(left)
            {
                val_16x8b = _mm_alignr_epi8(left_top_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }

            temp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
            dcval += _mm_extract_epi16(temp_8x16b, 4);
            dcval = dcval >> shft;
            pred2_16x8b = _mm_set1_epi8(dcval);
        }
        else
            pred2_16x8b = _mm_set1_epi8(128);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred2_16x8b);

        sad[DC_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[DC_I4x4] = sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes > 7)/* modes other than VERT, HORZ and DC valid? */
    {
        __m128i w11_16x8b, w121_16x8b;
        __m128i temp1_16x8b, temp2_16x8b;

        /* Performing FILT121 and FILT11 operation for all neighbour values */
        {
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b;
            __m128i const_2_8x16b;

            const_2_8x16b = _mm_set1_epi16(2);

            temp1_8x16b = _mm_unpacklo_epi8(left_top_16x8b, zero_vector); //l3 l2 l1 l0 tl t0 t1 t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                 // 0 l3 l2 l1 l0 tl t0 t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);         //l3 l3 l2 l1 l0 tl t0 t1

            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);        //l3+l3 l3+l2 l2+l1 ... t1+t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                 //l3+l3 l3+l3 l3+l2 ... t0+t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);
            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);        //4*l3 l3+2*l3+l2 l3+2*l2+l1 ... t0+2*t1+t2

            temp1_8x16b = _mm_add_epi16(const_2_8x16b, temp1_8x16b);      //4*l3+2 3*l3+l2+2 l3+2*l2+l1+2 ... t0+2*t1+t2+2
            temp1_8x16b = _mm_srli_epi16(temp1_8x16b, 2);

            temp1_16x8b = _mm_srli_si128(left_top_16x8b, 1);
            w11_16x8b = _mm_avg_epu8(left_top_16x8b, temp1_16x8b);

            temp2_16x8b = _mm_srli_si128(left_top_16x8b, 6);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_16x8b, zero_vector);    //t1 t2 t3 t4 t5 t6 t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                 //t2 t3 t4 t5 t6 t7 0 0
            temp3_8x16b = _mm_shufflehi_epi16(temp3_8x16b, 0xd4);         //t2 t3 t4 t5 t6 t7 t7 0

            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);        //t1+t2 t2+t3 ... t6+t7 t7+t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                 //t2+t3 t3+t4 ... t7+t7 0 0
            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);        //t1+2*t2+t3 t2+2*t3+t4 ... t6+2*t7+t7 t7+t7 0

            temp2_8x16b = _mm_add_epi16(const_2_8x16b, temp2_8x16b);      //t1+2*t2+t3+2 t2+2*t3+t4+2 ... t6+2*t7+t7+2 t7+t7+2 2
            temp2_8x16b = _mm_srli_epi16(temp2_8x16b, 2);

            w121_16x8b = _mm_packus_epi16(temp1_8x16b, temp2_8x16b);
        }
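
        /*
         * w121_16x8b and w11_16x8b now hold the two reference-sample filters
         * the diagonal modes draw from, computed over the neighbour array
         * p[] = {l3 l2 l1 l0 tl t0 .. t7} with edge replication:
         *
         *     w121[i] = (p[i-1] + 2*p[i] + p[i+1] + 2) >> 2
         *     w11[i]  = (p[i] + p[i+1] + 1) >> 1       // via _mm_avg_epu8
         *
         * The mode blocks below differ only in the byte shuffle that gathers
         * the right filtered samples for each of the 16 predicted positions.
         */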

        if(u4_valid_intra_modes & 8)/* DIAG_DL */
        {
            shuffle_16x8b = _mm_setr_epi8( 7,  8,  9, 10,
                                           8,  9, 10, 11,
                                           9, 10, 11, 12,
                                          10, 11, 12, 13);
            pred3_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred3_16x8b);

            sad[DIAG_DL_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DL_I4x4] = sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 16)/* DIAG_DR */
        {
            shuffle_16x8b = _mm_setr_epi8(5, 6, 7, 8,
                                          4, 5, 6, 7,
                                          3, 4, 5, 6,
                                          2, 3, 4, 5);
            pred4_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred4_16x8b);

            sad[DIAG_DR_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DR_I4x4] = sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 32)/* VERT_R mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 1);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(12, 13, 14, 15,
                                           4,  5,  6,  7,
                                           3, 12, 13, 14,
                                           2,  4,  5,  6);
            pred5_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred5_16x8b);

            sad[VERT_R_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_R_I4x4] = sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 64)/* HORZ_D mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(11,  5,  6,  7,
                                          10,  4, 11,  5,
                                           9,  3, 10,  4,
                                           8,  2,  9,  3);
            pred6_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred6_16x8b);

            sad[HORZ_D_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_D_I4x4] = sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 128)/* VERT_L mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 5);
            temp2_16x8b = _mm_srli_si128(w11_16x8b, 5);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, temp2_16x8b);
            shuffle_16x8b = _mm_setr_epi8(8,  9, 10, 11,
                                          2,  3,  4,  5,
                                          9, 10, 11, 12,
                                          3,  4,  5,  6);
            pred7_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred7_16x8b);

            sad[VERT_L_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_L_I4x4] = sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 256)/* HORZ_U mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(10, 3, 9, 2,
                                           9, 2, 8, 1,
                                           8, 1, 0, 0,
                                           0, 0, 0, 0);
            pred8_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred8_16x8b);

            sad[HORZ_U_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_U_I4x4] = sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? u4_lambda : lambda4);
        }
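
        /* Reduce the nine candidate costs pairwise; ties are then resolved
           by the if-else chain below in cost[] index order, i.e. VERT, HORZ,
           DC, DIAG_DL, DIAG_DR, VERT_R, HORZ_D, VERT_L, HORZ_U. */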

        min_cost = MIN3(MIN3(cost[0], cost[1], cost[2]),
                        MIN3(cost[3], cost[4], cost[5]),
                        MIN3(cost[6], cost[7], cost[8]));
    }
    else
    {
        /* Only the first three modes are valid */
        min_cost = MIN3(cost[0], cost[1], cost[2]);
    }

    *pu4_sadmin = min_cost;

    if(min_cost == cost[0])
    {
        *u4_intra_mode = VERT_I4x4;
    }
    else if(min_cost == cost[1])
    {
        *u4_intra_mode = HORZ_I4x4;
        pred0_16x8b = pred1_16x8b;
    }
    else if(min_cost == cost[2])
    {
        *u4_intra_mode = DC_I4x4;
        pred0_16x8b = pred2_16x8b;
    }
    else if(min_cost == cost[3])
    {
        *u4_intra_mode = DIAG_DL_I4x4;
        pred0_16x8b = pred3_16x8b;
    }
    else if(min_cost == cost[4])
    {
        *u4_intra_mode = DIAG_DR_I4x4;
        pred0_16x8b = pred4_16x8b;
    }
    else if(min_cost == cost[5])
    {
        *u4_intra_mode = VERT_R_I4x4;
        pred0_16x8b = pred5_16x8b;
    }
    else if(min_cost == cost[6])
    {
        *u4_intra_mode = HORZ_D_I4x4;
        pred0_16x8b = pred6_16x8b;
    }
    else if(min_cost == cost[7])
    {
        *u4_intra_mode = VERT_L_I4x4;
        pred0_16x8b = pred7_16x8b;
    }
    else if(min_cost == cost[8])
    {
        *u4_intra_mode = HORZ_U_I4x4;
        pred0_16x8b = pred8_16x8b;
    }

    /* store the winning 4x4 prediction, four bytes per row */
    mask_low_32b = _mm_set1_epi8(0xff);
    mask_low_32b = _mm_srli_si128(mask_low_32b, 12);

    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)pu1_dst);
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd2));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char *)(pu1_dst + dst_strd3));
}
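
/*
 * Usage sketch (illustrative, with hypothetical buffer names): unlike the
 * 16x16 evaluator, this routine returns a cost (SAD plus the mode-signalling
 * penalty) in *pu4_sadmin and always writes the winning 4x4 prediction.
 *
 *     UWORD32 u4_mode;
 *     WORD32 i4_cost_min = INT_MAX;
 *
 *     ih264e_evaluate_intra_4x4_modes_ssse3(pu1_src, pu1_ngbr_pels, pu1_pred,
 *                                           src_strd, pred_strd, u4_n_avblty,
 *                                           &u4_mode, &i4_cost_min,
 *                                           u4_valid_intra_modes, u4_lambda,
 *                                           u4_predictd_mode);
 */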

/**
 ******************************************************************************
 *
 * @brief
 *  Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the
 *  prediction.
 *
 * @par Description
 *  This function evaluates the first three intra chroma modes, computes the
 *  corresponding SADs and returns the buffer predicted with the best mode.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_ngbr_pels
 *  UWORD8 pointer to neighbouring pels
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] u4_n_avblty
 *  availability of neighbouring pixels
 *
 * @param[out] u4_intra_mode
 *  pointer to the variable in which the best mode is returned
 *
 * @param[in,out] pu4_sadmin
 *  pointer to the variable in which the minimum SAD is returned
 *
 * @param[in] u4_valid_intra_modes
 *  flags indicating which intra modes are valid
 *
 * @return
 *  none
 *
 ******************************************************************************
 */
void ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 *pu1_src,
                                              UWORD8 *pu1_ngbr_pels,
                                              UWORD8 *pu1_dst,
                                              UWORD32 src_strd,
                                              UWORD32 dst_strd,
                                              WORD32 u4_n_avblty,
                                              UWORD32 *u4_intra_mode,
                                              WORD32 *pu4_sadmin,
                                              UWORD32 u4_valid_intra_modes)
{
    WORD32 left, top;
    WORD32 sad_vert = INT_MAX, sad_horz = INT_MAX, sad_dc = INT_MAX, min_sad;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i src5_16x8b, src6_16x8b, src7_16x8b, src8_16x8b;

    __m128i top_16x8b, left_16x8b;
    __m128i pred1_16x8b, pred2_16x8b;
    __m128i tmp1_8x16b, tmp2_8x16b, sad_8x16b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    // Loading source
    {
        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src6_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src7_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src8_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    }
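
    /*
     * Chroma is handled interleaved (semi-planar UV): each 16-byte source
     * row covers one row of both 8x8 chroma components as U0 V0 U1 V1 ...
     * U7 V7, and all predictions below are built in the same layout. The
     * neighbour buffer is interleaved the same way, hence the UV-pair
     * shuffles and the 0x00ff de-interleaving masks used further down.
     */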

    if(left)
    {
        left_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        if(u4_valid_intra_modes & 02) //If HORZ mode is valid
        {
            __m128i left_tmp_16x8b, left_sh_16x8b;
            __m128i const_14_15_16x8b;

            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);

            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 4
            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 6
            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 8
            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);

            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_horz = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    if(top)
    {
        UWORD8 *pu1_top;

        pu1_top = pu1_ngbr_pels + 2 * BLK8x8SIZE + 2;
        top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

        if(u4_valid_intra_modes & 04) //If VERT mode is valid
        {
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_vert = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    if(u4_valid_intra_modes & 01) //If DC mode is valid
    {
        if(left && top)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i val_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            val_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            val_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);
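
            /*
             * Chroma DC prediction is computed per 4x4 quadrant of each 8x8
             * component. With both neighbours available the spec gives, per
             * component:
             *     top-left:     (sum(left_up)   + sum(top_left)  + 4) >> 3
             *     top-right:    (sum(top_right)                  + 2) >> 2
             *     bottom-left:  (sum(left_down)                  + 2) >> 2
             *     bottom-right: (sum(left_down) + sum(top_right) + 4) >> 3
             * which is what the two pred vectors below encode for the upper
             * and lower four rows.
             */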

            // First four rows
            dc_1u = (left_up_u + top_left_u + 4) >> 3;
            dc_1v = (left_up_v + top_left_v + 4) >> 3;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_1u = (left_down_u + 2) >> 2;
            dc_1v = (left_down_v + 2) >> 2;
            dc_2u = (left_down_u + top_right_u + 4) >> 3;
            dc_2v = (left_down_v + top_right_v + 4) >> 3;

            pred2_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(left)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 dc_u, dc_v;

            __m128i left_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            left_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            // First four rows
            dc_u = (left_up_u + 2) >> 2;
            dc_v = (left_up_v + 2) >> 2;

            pred1_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_u = (left_down_u + 2) >> 2;
            dc_v = (left_down_v + 2) >> 2;

            pred2_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(top)
        {
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i top_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            top_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);

            dc_1u = (top_left_u + 2) >> 2;
            dc_1v = (top_left_v + 2) >> 2;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else
        {
            pred1_16x8b = _mm_set1_epi8(128);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    min_sad = MIN3(sad_horz, sad_vert, sad_dc);

    /* Finding the minimum SAD and doing the corresponding prediction */
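    /* As with the 16x16 evaluator, the prediction is written only when this
       candidate beats the running minimum SAD passed in via *pu4_sadmin. */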
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;

        if(min_sad == sad_dc)
        {
            *u4_intra_mode = DC_CH_I8x8;

            if(!left) /* pred2 is set only when the left neighbour is available */
                pred2_16x8b = pred1_16x8b;

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;

            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else if(min_sad == sad_horz)
        {
            __m128i left_sh_16x8b, const_14_15_16x8b;

            *u4_intra_mode = HORZ_CH_I8x8;

            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);

            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else
        {
            *u4_intra_mode = VERT_CH_I8x8;

            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
        }
    }
}