1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_chroma_intra_pred_filters_x86_intr.c 22 * 23 * @brief 24 * Contains function Definition for intra prediction interpolation filters 25 * 26 * 27 * @author 28 * Ittiam 29 * 30 * @par List of Functions: 31 * ihevc_intra_pred_chroma_planar_sse42() 32 * 33 * ihevc_intra_pred_chroma_dc_sse42() 34 * 35 * @remarks 36 * None 37 * 38 ******************************************************************************* 39 */ 40 41 42 /*****************************************************************************/ 43 /* File Includes */ 44 /*****************************************************************************/ 45 46 #include "ihevc_typedefs.h" 47 #include "ihevc_macros.h" 48 #include "ihevc_func_selector.h" 49 #include "ihevc_platform_macros.h" 50 #include "ihevc_intra_pred.h" 51 #include "ihevc_chroma_intra_pred.h" 52 #include "ihevc_common_tables.h" 53 #include "ihevc_tables_x86_intr.h" 54 55 #include <mmintrin.h> 56 #include <xmmintrin.h> 57 #include <emmintrin.h> 58 #include <smmintrin.h> 59 #include <immintrin.h> 60 61 62 
/****************************************************************************/ 63 /* Constant Macros */ 64 /****************************************************************************/ 65 #define MAX_CU_SIZE 64 66 #define BIT_DEPTH 8 67 #define T32_4NT 128 68 #define T16_4NT 64 69 #define T16C_4NT 64 70 #define T8C_4NT 32 71 /****************************************************************************/ 72 /* Function Macros */ 73 /****************************************************************************/ 74 75 #define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x) 76 77 /* tables to shuffle 8-bit values */ 78 79 /*****************************************************************************/ 80 /* Function Definition */ 81 /*****************************************************************************/ 82 83 84 85 /** 86 ******************************************************************************* 87 * 88 * @brief 89 * Planar Intraprediction with reference neighboring samples location 90 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 91 * to section 8.4.4.2.4 in the standard 92 * 93 * @par Description: 94 * 95 * 96 * @param[in] pu1_src 97 * UWORD8 pointer to the source 98 * 99 * @param[in] pu1_dst 100 * UWORD8 pointer to the destination 101 * 102 * @param[in] src_strd 103 * integer source stride 104 * 105 * @param[in] dst_strd 106 * integer destination stride 107 * 108 * @param[in] nt 109 * integer Transform Block size 110 * 111 * @param[in] mode 112 * integer intraprediction mode 113 * 114 * @returns 115 * 116 * @remarks 117 * None 118 * 119 ******************************************************************************* 120 */ 121 122 void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref, 123 WORD32 src_strd, 124 UWORD8 *pu1_dst, 125 WORD32 dst_strd, 126 WORD32 nt, 127 WORD32 mode) 128 { 129 130 WORD32 row, col; 131 WORD32 log2nt = 5; 132 WORD32 two_nt, three_nt; 133 134 __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, 
const_temp3_4x32b, const_temp4_4x32b; 135 __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b; 136 UNUSED(src_strd); 137 UNUSED(mode); 138 139 switch(nt) 140 { 141 case 16: 142 log2nt = 4; 143 break; 144 case 8: 145 log2nt = 3; 146 break; 147 case 4: 148 log2nt = 2; 149 break; 150 default: 151 break; 152 } 153 two_nt = 2 * nt; 154 three_nt = 3 * nt; 155 156 /* Planar filtering */ 157 158 /* setting vallues in registera*/ 159 160 // pu1_ref[2*(two_nt - 1 - row)] 161 // pu1_ref[2 * (three_nt + 1)] 162 // pu1_ref[2 * (two_nt + 1) + col] 163 // pu1_ref[2 * (nt - 1)] 164 165 const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], 166 pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], 167 pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]); 168 169 const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], 170 pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]); 171 172 const_temp4_4x32b = _mm_set1_epi16(nt - 1); 173 const_temp6_4x32b = _mm_set1_epi16(nt); 174 const_temp7_4x32b = _mm_set1_epi16(4); 175 176 zero_8x16b = _mm_set1_epi32(0); 177 178 if(nt % 4 == 0) 179 { 180 const_temp7_4x32b = _mm_set1_epi16(4); 181 182 for(row = 0; row < nt; row++) 183 { 184 __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b; 185 __m128i res_temp3_8x16b; 186 187 const_temp2_4x32b = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], 188 pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], 189 pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]); 190 191 const_temp3_4x32b = _mm_set1_epi16((row + 1)); 192 row_8x16b = _mm_set1_epi16((nt - 1 - row)); 193 194 
const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0); 195 col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1); 196 197 const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b); 198 199 /*(row + 1) * pu1_ref[nt - 1]*/ 200 res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b); 201 202 /*(row + 1) * pu1_ref[nt - 1] + nt)*/ 203 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 204 205 for(col = 0; col < 2 * nt; col += 8) 206 { 207 __m128i src_temp_8x16b; 208 209 /* loding 8bit 16 pixles*/ 210 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col)); 211 212 src_temp_8x16b = _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/ 213 214 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */ 215 res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b); 216 217 /*(col + 1) * pu1_ref[three_nt + 1]*/ 218 res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b); 219 220 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/ 221 res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b); 222 223 res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b); 224 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 225 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b); 226 227 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1)); 228 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b); 229 230 _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b); 231 232 const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b); 233 col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b); 234 } /* inner loop ends here */ 235 } 236 } 237 } 238 239 /** 240 ******************************************************************************* 241 * 242 * @brief 243 * Intraprediction for DC mode with reference neighboring samples location 244 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 245 * to section 
 *  8.4.4.2.5 in the standard
 *
 * @par Description:
 *  Computes the DC value as the rounded average of the interleaved U/V
 *  top and left reference samples (separate U and V accumulators), then
 *  fills the nt x nt chroma block (2*nt bytes wide) with the U/V pair.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source (interleaved U/V reference samples)
 *
 * @param[in] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride (in bytes)
 *
 * @param[in] nt
 *  integer Transform Block size (Chroma)
 *
 * @param[in] mode
 *  integer intraprediction mode (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{
    WORD32 acc_dc_u, acc_dc_v;  /* running sums of U / V reference bytes */
    WORD32 dc_val_u, dc_val_v;  /* final DC values for the U / V planes */
    WORD32 row;
    WORD32 log2nt = 5;
    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    __m128i m_zero = _mm_set1_epi32(0);
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt = log2(nt); the final average shifts by (log2nt + 1).
     * NOTE(review): case 32 sets log2nt but has no matching accumulation
     * branch below (acc stays 0); for 4:2:0 chroma nt is at most 16 --
     * confirm nt == 32 is never passed here. */
    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }

    acc_dc_u = 0;
    acc_dc_v = 0;

    /* Calculate DC value for the transform block */

    /* shuffle mask from the shared x86 tables.
     * NOTE(review): presumably groups the interleaved U/V 16-bit lanes so
     * the two _mm_hadd_epi16 passes below reduce U and V separately into
     * adjacent 16-bit slots -- confirm against IHEVCE_SHUFFLEMASKY9. */
    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    if(nt == 16)
    {
        __m128i temp_sad;

        /* sum 64 reference bytes (2*nt U/V pairs) starting at byte 2*nt */
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));

        /* widen low 8 bytes of each load to 16 bits */
        src_temp5 = _mm_cvtepu8_epi16(src_temp3);
        src_temp6 = _mm_cvtepu8_epi16(src_temp4);
        src_temp9 = _mm_cvtepu8_epi16(src_temp7);
        src_temp10 = _mm_cvtepu8_epi16(src_temp8);

        /* widen high 8 bytes of each load to 16 bits */
        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);
        src_temp7 = _mm_srli_si128(src_temp7, 8);
        src_temp8 = _mm_srli_si128(src_temp8, 8);

        src_temp3 = _mm_cvtepu8_epi16(src_temp3);
        src_temp4 = _mm_cvtepu8_epi16(src_temp4);
        src_temp7 = _mm_cvtepu8_epi16(src_temp7);
        src_temp8 = _mm_cvtepu8_epi16(src_temp8);

        /* pairwise vertical adds; 16-bit lanes cannot overflow here
         * (at most 32 byte-valued addends per lane: 32*255 < 65536) */
        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
        /* regroup U/V lanes, then reduce horizontally to two 16-bit sums */
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        /* extract the U sum (low 32 bits) and V sum (next 32 bits) */
        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 8)
    {
        /* same reduction as nt == 16 but over 32 reference bytes */
        __m128i temp_sad;
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));

        src_temp5 = _mm_cvtepu8_epi16(src_temp3);
        src_temp6 = _mm_cvtepu8_epi16(src_temp4);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);

        src_temp3 = _mm_cvtepu8_epi16(src_temp3);
        src_temp4 = _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 4)
    {
        /* same reduction over a single 16-byte load */
        __m128i temp_sad;
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));

        src_temp5 = _mm_cvtepu8_epi16(src_temp3);
        src_temp4 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);

        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    /* The vector sum covered bytes [2*nt, 6*nt); shift the window by one
     * U/V pair: add the pair at byte 6*nt and drop the pair at byte 4*nt.
     * NOTE(review): presumably aligns the summed range with the reference-C
     * DC window (left column + top row) -- confirm against the scalar
     * implementation. */
    acc_dc_u += pu1_ref[6 * nt];
    acc_dc_v += pu1_ref[6 * nt + 1];

    acc_dc_u -= pu1_ref[4 * nt];
    acc_dc_v -= pu1_ref[4 * nt + 1];

    /* rounded average over the 2*nt samples of each plane */
    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);

    /* pack as a 16-bit U/V pair so _mm_set1_epi16 replicates the
     * interleaved chroma pattern across the whole register */
    dc_val_u = dc_val_u | (dc_val_v << 8);

    /* Fill the remaining rows with DC value*/

    if(nt == 4)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* 4 rows x 8 bytes: pu1_dst[(row * dst_strd) + col] = dc_val; */
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

    }
    else if(nt == 8)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* 8 rows x 16 bytes: pu1_dst[(row * dst_strd) + col] = dc_val; */
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }

    else /* nt == 16 */
    {

        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* 8 rows x 32 bytes per iteration (two 16-byte stores per row);
         * two iterations cover the 16 x 32-byte block */
        for(row = 0; row < nt; row += 8)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val; */
            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);

            pu1_dst += 8 * dst_strd;
        }


    }

}