1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevcd_it_rec_dc_atom_intr.c 22 * 23 * @brief 24 * Platform specific intrinsic implementation of certain functions 25 * 26 * @author 27 * Ittiam 28 * @par List of Functions: 29 * - ihevcd_itrans_recon_dc 30 * - ihevcd_fmt_conv_420sp_to_420p 31 * 32 * @remarks 33 * None 34 * 35 ******************************************************************************* 36 */ 37 38 #include "ihevc_typedefs.h" 39 #include "ihevc_defs.h" 40 #include "ihevc_macros.h" 41 #include "ihevc_platform_macros.h" 42 #include "ihevcd_function_selector.h" 43 44 #include <immintrin.h> 45 46 47 48 49 void ihevcd_itrans_recon_dc_luma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 50 WORD32 log2_trans_size, WORD16 i2_coeff_value) 51 { 52 __m128i m_temp_reg_0; 53 __m128i m_temp_reg_1; 54 __m128i m_temp_reg_2; 55 __m128i m_temp_reg_3; 56 __m128i m_temp_reg_4; 57 __m128i m_temp_reg_5; 58 __m128i m_temp_reg_6; 59 __m128i m_temp_reg_7; 60 __m128i m_temp_reg_8; 61 __m128i m_temp_reg_9; 62 __m128i m_temp_reg_10; 63 __m128i m_temp_reg_11; 64 __m128i m_temp_reg_12; 65 __m128i m_temp_reg_13; 66 __m128i m_temp_reg_14; 67 __m128i m_temp_reg_15; 68 __m128i m_temp_reg_20, zero_8x16b; 69 __m128i *pi4_dst = (__m128i *)pu1_dst; 70 71 72 73 WORD32 add, shift; 74 WORD32 dc_value, quant_out; 75 WORD32 trans_size; 76 77 78 79 trans_size = (1 << log2_trans_size); 80 81 quant_out = i2_coeff_value; 82 83 shift = IT_SHIFT_STAGE_1; 84 add = 1 << (shift - 1); 85 dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 86 shift = IT_SHIFT_STAGE_2; 87 add = 1 << (shift - 1); 88 dc_value = CLIP_S16((dc_value * 64 + add) >> shift); 89 90 /*Replicate the DC value within 16 bits in 128 bit register*/ 91 m_temp_reg_20 = _mm_set1_epi16(dc_value); 92 zero_8x16b = _mm_setzero_si128(); 93 94 if(trans_size == 4) 95 { 96 WORD32 *pi4_dst = (WORD32 *)pu1_dst; 97 98 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 99 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 100 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 101 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 102 103 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1); 104 m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3); 105 106 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 107 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 108 109 m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 110 m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 111 112 m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 113 114 115 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8); 116 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4); 117 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8); 118 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12); 119 pu1_dst += dst_strd; 120 pi4_dst = (WORD32 *)(pu1_dst); 121 122 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 123 pu1_dst += dst_strd; 124 pi4_dst = (WORD32 *)(pu1_dst); 125 126 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 127 pu1_dst += dst_strd; 128 pi4_dst = (WORD32 *)(pu1_dst); 129 130 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 131 } 132 else 133 { 134 WORD32 i, j; 135 136 for(i = 1; i <= trans_size; i += 4) 137 { 138 for(j = 1; j <= trans_size; j += 8) 139 { 140 141 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 142 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 143 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 144 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 145 146 147 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b); 148 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b); 149 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b); 150 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b); 151 152 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 153 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 154 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 155 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 156 157 pi4_dst = (__m128i *)(pu1_dst); 158 159 m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 160 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 161 162 pi4_dst = (__m128i *)(pu1_dst + dst_strd); 163 164 m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8); 165 _mm_storel_epi64(pi4_dst, m_temp_reg_13); 166 167 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 168 169 m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 170 _mm_storel_epi64(pi4_dst, m_temp_reg_14); 171 172 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 173 174 m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8); 175 _mm_storel_epi64(pi4_dst, m_temp_reg_15); 176 177 pu1_pred += 8; 178 pu1_dst += 8; 179 } 180 pu1_pred += 4 * pred_strd - trans_size; 181 pu1_dst += 4 * dst_strd - trans_size; 182 } 183 } 184 185 186 } 187 188 void ihevcd_itrans_recon_dc_chroma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 189 WORD32 log2_trans_size, WORD16 i2_coeff_value) 190 { 191 __m128i m_temp_reg_0; 192 __m128i m_temp_reg_1; 193 __m128i m_temp_reg_2; 194 __m128i m_temp_reg_3; 195 __m128i m_temp_reg_4; 196 __m128i m_temp_reg_5; 197 __m128i m_temp_reg_6; 198 __m128i m_temp_reg_7; 199 __m128i m_temp_reg_8; 200 __m128i m_temp_reg_9; 201 __m128i m_temp_reg_10; 202 __m128i m_temp_reg_11; 203 __m128i m_temp_reg_12; 204 __m128i m_temp_reg_13; 205 __m128i m_temp_reg_14; 206 __m128i m_temp_reg_15; 207 __m128i m_temp_reg_20, zero_8x16b; 208 __m128i *pi4_dst = (__m128i *)pu1_dst; 209 210 211 WORD32 add, shift; 212 WORD32 dc_value, quant_out; 213 WORD32 trans_size; 214 215 216 WORD32 shuffle_mask_4x4 = 0x06040200; 217 WORD32 unchanged_mask_4x4 = 0x07050301; 218 LWORD64 shuffle_mask = 0x0E0C0A0806040200LL; 219 LWORD64 unchanged_mask = 0x0F0D0B0907050301LL; 220 221 trans_size = (1 << log2_trans_size); 222 223 quant_out = i2_coeff_value; 224 225 shift = IT_SHIFT_STAGE_1; 226 add = 1 << (shift - 1); 227 dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 228 shift = IT_SHIFT_STAGE_2; 229 add = 1 << (shift - 1); 230 dc_value = CLIP_S16((dc_value * 64 + add) >> shift); 231 232 /*Replicate the DC value within 16 bits in 128 bit register*/ 233 m_temp_reg_20 = _mm_set1_epi16(dc_value); 234 zero_8x16b = _mm_setzero_si128(); 235 236 if(trans_size == 4) 237 { 238 __m128i chroma_shuffle_mask_16x8b; 239 __m128i chroma_unchanged_mask_16x8b; 240 chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4); 241 chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4); 242 243 /*Load the prediction data*/ 244 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 245 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 246 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 247 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 248 249 m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 250 m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 251 m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 252 m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 253 254 m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 255 m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 256 257 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b); 258 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b); 259 260 m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 261 m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 262 263 /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 264 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst); 265 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 266 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd)); 267 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd)); 268 269 m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 270 m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 271 m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 272 m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 273 274 275 m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 276 m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0); 277 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 278 m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1); 279 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 280 m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2); 281 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 282 m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3); 283 284 /*Store the result in the destination*/ 285 _mm_storel_epi64(pi4_dst, m_temp_reg_9); 286 pu1_dst += dst_strd; 287 pi4_dst = (__m128i *)(pu1_dst); 288 289 290 _mm_storel_epi64(pi4_dst, m_temp_reg_10); 291 pu1_dst += dst_strd; 292 pi4_dst = (__m128i *)(pu1_dst); 293 294 _mm_storel_epi64(pi4_dst, m_temp_reg_11); 295 pu1_dst += dst_strd; 296 pi4_dst = (__m128i *)(pu1_dst); 297 298 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 299 } 300 else 301 { 302 WORD32 i, j; 303 __m128i chroma_shuffle_mask_16x8b; 304 __m128i chroma_unchanged_mask_16x8b; 305 chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask)); 306 chroma_unchanged_mask_16x8b = 307 _mm_loadl_epi64((__m128i *)(&unchanged_mask)); 308 309 for(i = 0; i < trans_size; i += 4) 310 { 311 for(j = 0; j < trans_size; j += 8) 312 { 313 314 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred); 315 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd)); 316 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd)); 317 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd)); 318 319 /*Retain only one chroma component*/ 320 m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 321 m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 322 m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 323 m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 324 325 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 326 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 327 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b); 328 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b); 329 330 m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 331 m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 332 m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 333 m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 334 335 336 /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 337 m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst); 338 m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd)); 339 m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd)); 340 m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd)); 341 342 m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 343 m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 344 m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 345 m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 346 347 m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 348 m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 349 350 m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0); 351 m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8); 352 m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1); 353 354 m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2); 355 m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8); 356 m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3); 357 358 /*Store the result in the destination*/ 359 pi4_dst = (__m128i *)(pu1_dst); 360 361 _mm_storel_epi64(pi4_dst, m_temp_reg_12); 362 m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8); 363 364 pi4_dst = (__m128i *)(pu1_dst + 8); 365 _mm_storel_epi64(pi4_dst, m_temp_reg_8); 366 367 pi4_dst = (__m128i *)(pu1_dst + dst_strd); 368 369 _mm_storel_epi64(pi4_dst, m_temp_reg_13); 370 m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8); 371 372 pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8); 373 _mm_storel_epi64(pi4_dst, m_temp_reg_9); 374 375 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 376 377 _mm_storel_epi64(pi4_dst, m_temp_reg_14); 378 m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8); 379 380 pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8); 381 _mm_storel_epi64(pi4_dst, m_temp_reg_10); 382 383 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 384 385 _mm_storel_epi64(pi4_dst, m_temp_reg_15); 386 m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8); 387 388 pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8); 389 _mm_storel_epi64(pi4_dst, m_temp_reg_11); 390 391 pu1_pred += 16; 392 pu1_dst += 16; 393 } 394 395 pu1_pred += 4 * pred_strd - 2 * trans_size; 396 pu1_dst += 4 * dst_strd - 2 * trans_size; 397 } 398 } 399 400 401 } 402