1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /** 21 ******************************************************************************* 22 * @file 23 * ih264_chroma_intra_pred_filters_ssse3.c 24 * 25 * @brief 26 * Contains function definitions for chroma intra prediction filters in x86 27 * intrinsics 28 * 29 * @author 30 * Ittiam 31 * 32 * @par List of Functions: 33 * -ih264_intra_pred_chroma_8x8_mode_horz_ssse3 34 * -ih264_intra_pred_chroma_8x8_mode_vert_ssse3 35 * -ih264_intra_pred_chroma_8x8_mode_plane_ssse3 36 * 37 * @remarks 38 * None 39 * 40 ******************************************************************************* 41 */ 42 43 /*****************************************************************************/ 44 /* File Includes */ 45 /*****************************************************************************/ 46 47 /* System include files */ 48 #include <stdio.h> 49 #include <stddef.h> 50 #include <string.h> 51 52 /* User include files */ 53 #include "ih264_defs.h" 54 #include "ih264_typedefs.h" 55 #include "ih264_macros.h" 56 #include "ih264_platform_macros.h" 57 #include "ih264_intra_pred_filters.h" 58 59 60 
/*****************************************************************************/
/* Chroma Intra prediction 8x8 filters                                       */
/*****************************************************************************/
/**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_horz_ssse3
 *
 * @brief
 *  Perform Intra prediction for chroma_8x8 mode:Horizontal
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:Horizontal, described in
 *  sec 8.3.4.2. Each predicted row is the row's left neighbour pair (U,V)
 *  replicated across all 8 chroma columns (16 interleaved bytes).
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 ******************************************************************************
 */
void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{

    UWORD8 *pu1_left; /* Pointer to the left predictors (U,V pair nearest the top row) */
    WORD32 dst_strd2;

    __m128i row1_16x8b, row2_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    /* Left neighbours are stored before the top neighbours in pu1_src;
     * successive rows' (U,V) pairs sit at decreasing addresses (step -2). */
    pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;

    /* Process rows in pairs: broadcast each row's left (U,V) 16-bit pair
     * to all 8 lanes, then store the full 16-byte row. */
    /* NOTE(review): the UWORD8 buffer is read through a WORD16* — relies on
     * x86 tolerating unaligned/type-punned loads; technically strict-aliasing
     * territory in ISO C. Confirm against project conventions. */
    dst_strd2 = dst_strd << 1;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_vert_ssse3
 *
 * @brief
 *  Perform Intra prediction for chroma_8x8 mode:vertical
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:vertical, described in
 *  sec 8.3.4.3. The 16-byte top neighbour row (8 interleaved U,V pairs)
 *  is copied into all 8 destination rows.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top; /* Pointer to start of top predictors */
    WORD32 dst_strd2;

    __m128i top_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    /* Top neighbours start just past the left neighbours and the top-left
     * corner sample pair in pu1_src. */
    pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;

    /* Load the top row once; it is stored unchanged into every row below. */
    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

    dst_strd2
= dst_strd << 1;
    /* Replicate the cached top row into all 8 rows, two rows per step. */
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_plane_ssse3
 *
 * @brief
 *  Perform Intra prediction for chroma_8x8 mode:PLANE
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:PLANE, described in
 *  sec 8.3.4.4. A plane pred[x][y] = Clip((a + b*(x-3) + c*(y-3) + 16) >> 5)
 *  is fitted per component (U and V) from the top and left neighbours.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 ******************************************************************************
 */
void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    UWORD8 *pu1_left, *pu1_top;
    /* Plane parameters, one set per chroma component (sec 8.3.4.4) */
    WORD32 a_u, a_v, b_u, b_v, c_u, c_v;

    __m128i mul_8x16b, shuffle_8x16b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + MB_SIZE + 2;
    pu1_left = pu1_src + MB_SIZE - 2;

    /* Weights 1..4 applied twice (once for U lanes, once for V lanes)
     * by the pmaddwd-based gradient sums below. */
    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
    shuffle_8x16b
= _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                 0xff01, 0xff03, 0xff05, 0xff07);
    /* pshufb mask: per 16-bit lane the low byte is the source index and the
     * 0xff high byte zeroes the upper half, so even bytes (U) land in the low
     * four words and odd bytes (V) in the high four — de-interleave + widen. */

    //calculating a, b and c
    {
        WORD32 h_u, h_v, v_u, v_v; /* H and V gradient sums per component */

        __m128i h_val1_16x8b, h_val2_16x8b;
        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
        __m128i v_val1_16x8b, v_val2_16x8b;
        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
        __m128i hv_val_4x32b;

        /* H uses top[4..7] vs top[2..-1] (reversed); V uses the analogous
         * left-column pairs (left samples at decreasing addresses). */
        h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
        h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
        v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
        v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));

        // reversing the order of the (U,V) pairs so the weights line up
        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

        // separating u and v and 8-bit to 16-bit conversion
        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

        /* Pairwise differences, then weighted sums: pmaddwd with {1,2,3,4}
         * yields two 32-bit partial sums per component. */
        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

        /* hadd folds the partials: lanes = {H_u, H_v, V_u, V_v} (32-bit). */
        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

        /* a = 16 * (bottom-most left sample + right-most top sample) */
        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

        /* Extract the low 16 bits of each 32-bit sum ... */
        h_u = _mm_extract_epi16(hv_val_4x32b, 0);
        h_v = _mm_extract_epi16(hv_val_4x32b, 2);
        v_u = _mm_extract_epi16(hv_val_4x32b, 4);
        v_v = _mm_extract_epi16(hv_val_4x32b, 6);

        /* ... then sign-extend from 16 bits and double in one step.
         * NOTE(review): left-shifting a negative signed value is formally UB
         * in ISO C; this relies on the usual two's-complement behaviour. */
        h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
        h_v = (h_v << 16) >> 15;
        v_u = (v_u << 16) >> 15;
        v_v = (v_v << 16) >> 15;

        /* b = (34*H + 32) >> 6 and c = (34*V + 32) >> 6, with the factor 2
         * already folded into h_*/v_* above, so 17*x here equals 34*H. */
        b_u = ((h_u << 4) + h_u + 32) >> 6;
        b_v = ((h_v << 4) + h_v + 32) >> 6;
        c_u = ((v_u << 4) + v_u + 32) >> 6;
        c_v = ((v_v << 4) + v_v + 32) >> 6;
    }
    //using a, b and c to compute the fitted plane values
    {
        __m128i const_8x16b, c2_8x16b;
        __m128i res1_l_8x16b, res1_h_8x16b;
        __m128i res2_l_8x16b, res2_h_8x16b;
        __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
        __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

        WORD32 b_u2, b_v2, b_u3, b_v3;
        WORD32 const_u, const_v;
        WORD32 dst_strd2;

        /* Row-0 constant term: a + c*(0-3) + 16 (rounding bias for >>5). */
        const_u = a_u - (c_u << 1) - c_u + 16;
        const_v = a_v - (c_v << 1) - c_v + 16;

        b_u2 = b_u << 1;
        b_v2 = b_v << 1;
        b_u3 = b_u + b_u2;
        b_v3 = b_v + b_v2;

        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
        //contains {-b*3, -b*2, -b*1, b*0}  -- b*(x-3) for x = 0..3
        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
        //contains {b*1, b*2, b*3, b*4}    -- b*(x-3) for x = 4..7
        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

        /* res1 = row y, res2 = row y+1 (adds one extra c per component).
         * Shift by 5 and packus perform the >>5 and Clip1 of sec 8.3.4.4. */
        // rows 1, 2
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
        res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        dst_strd2 = dst_strd << 1;
        /* From here on each row pair advances by 2 rows, so the per-step
         * increment becomes 2*c per component. */
        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 3, 4
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 5, 6
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 7, 8
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

    }
}