1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevcd_it_rec_dc_x86_intr.c 22 * 23 * @brief 24 * Platform specific intrinsic implementation of certain functions 25 * 26 * @author 27 * Ittiam 28 * @par List of Functions: 29 * - ihevcd_itrans_recon_dc 30 * - ihevcd_fmt_conv_420sp_to_420p 31 * 32 * @remarks 33 * None 34 * 35 ******************************************************************************* 36 */ 37 38 #include "ihevc_typedefs.h" 39 #include "ihevc_defs.h" 40 #include "ihevc_macros.h" 41 #include "ihevc_platform_macros.h" 42 #include "ihevcd_function_selector.h" 43 44 #include <immintrin.h> 45 46 47 void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 48 WORD32 log2_trans_size, WORD16 i2_coeff_value) 49 { 50 __m128i m_temp_reg_0; 51 __m128i m_temp_reg_1; 52 __m128i m_temp_reg_2; 53 __m128i m_temp_reg_3; 54 __m128i m_temp_reg_4; 55 __m128i m_temp_reg_5; 56 __m128i m_temp_reg_6; 57 __m128i m_temp_reg_7; 58 __m128i m_temp_reg_8; 59 __m128i m_temp_reg_9; 60 __m128i m_temp_reg_10; 61 __m128i m_temp_reg_11; 62 __m128i m_temp_reg_12; 63 __m128i m_temp_reg_13; 64 __m128i m_temp_reg_14; 65 __m128i m_temp_reg_15; 66 __m128i m_temp_reg_20, zero_8x16b; 67 __m128i *pi4_dst = (__m128i *)pu1_dst; 68 69 70 //WORD32 row,col; 71 WORD32 add, shift; 72 WORD32 dc_value, quant_out; 73 WORD32 trans_size; 74 75 76 77 78 trans_size = (1 << log2_trans_size); 79 80 quant_out = i2_coeff_value; 81 82 shift = IT_SHIFT_STAGE_1; 83 add = 1 << (shift - 1); 84 dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 85 shift = IT_SHIFT_STAGE_2; 86 add = 1 << (shift - 1); 87 dc_value = CLIP_S16((dc_value * 64 + add) >> shift); 88 89 /*Replicate the DC value within 16 bits in 128 bit register*/ 90 m_temp_reg_20 = _mm_set1_epi16(dc_value); 91 zero_8x16b = _mm_setzero_si128(); 92 93 if(trans_size == 4) 94 { 95 WORD32 *pi4_dst = (WORD32 *)pu1_dst; 96 97 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 98 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 99 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 100 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 101 102 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1); 103 m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3); 104 105 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 106 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 107 108 m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 109 m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 110 111 m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 112 113 114 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8); 115 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4); 116 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8); 117 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12); 118 pu1_dst += dst_strd; 119 pi4_dst = (WORD32 *)(pu1_dst); 120 121 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 122 pu1_dst += dst_strd; 123 pi4_dst = (WORD32 *)(pu1_dst); 124 125 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 126 pu1_dst += dst_strd; 127 pi4_dst = (WORD32 *)(pu1_dst); 128 129 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 130 } 131 else 132 { 133 WORD32 i, j; 134 135 for(i = 1; i <= trans_size; i += 4) 136 { 137 for(j = 1; j <= trans_size; j += 8) 138 { 139 140 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 141 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 142 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 143 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 144 145 146 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b); 147 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b); 148 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b); 149 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b); 150 151 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 152 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 153 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 154 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 155 156 pi4_dst = (__m128i *)(pu1_dst); 157 158 m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 159 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 160 161 pi4_dst = (__m128i *)(pu1_dst + dst_strd); 162 163 m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8); 164 _mm_storel_epi64(pi4_dst, m_temp_reg_13); 165 166 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 167 168 m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 169 _mm_storel_epi64(pi4_dst, m_temp_reg_14); 170 171 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 172 173 m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8); 174 _mm_storel_epi64(pi4_dst, m_temp_reg_15); 175 176 pu1_pred += 8; 177 pu1_dst += 8; 178 } 179 pu1_pred += 4 * pred_strd - trans_size; 180 pu1_dst += 4 * dst_strd - trans_size; 181 } 182 } 183 184 185 } 186 187 void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 188 WORD32 log2_trans_size, WORD16 i2_coeff_value) 189 { 190 __m128i m_temp_reg_0; 191 __m128i m_temp_reg_1; 192 __m128i m_temp_reg_2; 193 __m128i m_temp_reg_3; 194 __m128i m_temp_reg_4; 195 __m128i m_temp_reg_5; 196 __m128i m_temp_reg_6; 197 __m128i m_temp_reg_7; 198 __m128i m_temp_reg_8; 199 __m128i m_temp_reg_9; 200 __m128i m_temp_reg_10; 201 __m128i m_temp_reg_11; 202 __m128i m_temp_reg_12; 203 __m128i m_temp_reg_13; 204 __m128i m_temp_reg_14; 205 __m128i m_temp_reg_15; 206 __m128i m_temp_reg_20, zero_8x16b; 207 __m128i *pi4_dst = (__m128i *)pu1_dst; 208 209 210 //WORD32 row,col; 211 WORD32 add, shift; 212 WORD32 dc_value, quant_out; 213 WORD32 trans_size; 214 215 216 WORD32 shuffle_mask_4x4 = 0x06040200; 217 WORD32 unchanged_mask_4x4 = 0x07050301; 218 LWORD64 shuffle_mask = 0x0E0C0A0806040200LL; 219 LWORD64 unchanged_mask = 0x0F0D0B0907050301LL; 220 221 trans_size = (1 << log2_trans_size); 222 223 quant_out = i2_coeff_value; 224 225 shift = IT_SHIFT_STAGE_1; 226 add = 1 << (shift - 1); 227 dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 228 shift = IT_SHIFT_STAGE_2; 229 add = 1 << (shift - 1); 230 dc_value = CLIP_S16((dc_value * 64 + add) >> shift); 231 232 /*Replicate the DC value within 16 bits in 128 bit register*/ 233 m_temp_reg_20 = _mm_set1_epi16(dc_value); 234 zero_8x16b = _mm_setzero_si128(); 235 236 if(trans_size == 4) 237 { 238 __m128i chroma_shuffle_mask_16x8b; 239 __m128i chroma_unchanged_mask_16x8b; 240 chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4); 241 chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4); 242 243 /*Load the prediction data*/ 244 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 245 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 246 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 247 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 248 249 m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 250 m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 251 m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 252 m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 253 254 m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 255 m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 256 257 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b); 258 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b); 259 260 m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 261 m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 262 263 /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 264 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst); 265 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 266 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd)); 267 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd)); 268 269 m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 270 m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 271 m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 272 m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 273 274 275 m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 276 m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0); 277 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 278 m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1); 279 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 280 m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2); 281 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 282 m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3); 283 284 /*Store the result in the destination*/ 285 _mm_storel_epi64(pi4_dst, m_temp_reg_9); 286 pu1_dst += dst_strd; 287 pi4_dst = (__m128i *)(pu1_dst); 288 289 290 _mm_storel_epi64(pi4_dst, m_temp_reg_10); 291 pu1_dst += dst_strd; 292 pi4_dst = (__m128i *)(pu1_dst); 293 294 _mm_storel_epi64(pi4_dst, m_temp_reg_11); 295 pu1_dst += dst_strd; 296 pi4_dst = (__m128i *)(pu1_dst); 297 298 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 299 } 300 else 301 { 302 WORD32 i, j; 303 __m128i chroma_shuffle_mask_16x8b; 304 __m128i chroma_unchanged_mask_16x8b; 305 chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask)); 306 chroma_unchanged_mask_16x8b = 307 _mm_loadl_epi64((__m128i *)(&unchanged_mask)); 308 309 for(i = 0; i < trans_size; i += 4) 310 { 311 for(j = 0; j < trans_size; j += 8) 312 { 313 314 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred); 315 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd)); 316 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd)); 317 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd)); 318 319 /*Retain only one chroma component*/ 320 m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 321 m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 322 m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 323 m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 324 325 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 326 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 327 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b); 328 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b); 329 330 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 331 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 332 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 333 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 334 335 336 /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 337 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst); 338 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd)); 339 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd)); 340 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd)); 341 342 m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 343 m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 344 m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 345 m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 346 347 m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 348 m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 349 350 m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0); 351 m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8); 352 m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1); 353 354 m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2); 355 m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8); 356 m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3); 357 358 /*Store the result in the destination*/ 359 pi4_dst = (__m128i *)(pu1_dst); 360 361 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 362 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8); 363 364 pi4_dst = (__m128i *)(pu1_dst + 8); 365 _mm_storel_epi64(pi4_dst, m_temp_reg_8); 366 367 pi4_dst = (__m128i *)(pu1_dst + dst_strd); 368 369 _mm_storel_epi64(pi4_dst, m_temp_reg_13); 370 m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8); 371 372 pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8); 373 _mm_storel_epi64(pi4_dst, m_temp_reg_9); 374 375 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 376 377 _mm_storel_epi64(pi4_dst, m_temp_reg_14); 378 m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8); 379 380 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8); 381 _mm_storel_epi64(pi4_dst, m_temp_reg_10); 382 383 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 384 385 _mm_storel_epi64(pi4_dst, m_temp_reg_15); 386 m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8); 387 388 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8); 389 _mm_storel_epi64(pi4_dst, m_temp_reg_11); 390 391 pu1_pred += 16; 392 pu1_dst += 16; 393 } 394 395 pu1_pred += 4 * pred_strd - 2 * trans_size; 396 pu1_dst += 4 * dst_strd - 2 * trans_size; 397 } 398 } 399 400 401 } 402