1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /** 21 ******************************************************************************* 22 * @file 23 * ih264_iquant_itrans_recon_dc_ssse3.c 24 * 25 * @brief 26 * Contains function definitions for inverse quantization, inverse 27 * transform and reconstruction 28 * 29 * @author 30 * Mohit [100664] 31 * 32 * @par List of Functions: 33 * - ih264_iquant_itrans_recon_4x4_dc_ssse3() 34 * - ih264_iquant_itrans_recon_8x8_dc_ssse3() 35 * 36 * @remarks 37 * None 38 * 39 ******************************************************************************* 40 */ 41 /* User include files */ 42 #include "ih264_typedefs.h" 43 #include "ih264_defs.h" 44 #include "ih264_trans_macros.h" 45 #include "ih264_macros.h" 46 #include "ih264_platform_macros.h" 47 #include "ih264_trans_data.h" 48 #include "ih264_size_defs.h" 49 #include "ih264_structs.h" 50 #include "ih264_trans_quant_itrans_iquant.h" 51 #include <immintrin.h> 52 53 /* 54 ******************************************************************************** 55 * 56 * @brief This function reconstructs a 4x4 sub block from quantized resiude and 57 * prediction buffer for dc input pattern only, i.e. only the (0,0) element of the input 58 * 4x4 block is non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c 59 * 60 * @par Description: 61 * The quantized residue is first inverse quantized, then inverse transformed. 62 * This inverse transformed content is added to the prediction buffer to recon- 63 * struct the end output 64 * 65 * @param[in] pi2_src 66 * quantized 4x4 block 67 * 68 * @param[in] pu1_pred 69 * prediction 4x4 block 70 * 71 * @param[out] pu1_out 72 * reconstructed 4x4 block 73 * 74 * @param[in] src_strd 75 * quantization buffer stride 76 * 77 * @param[in] pred_strd, 78 * Prediction buffer stride 79 * 80 * @param[in] out_strd 81 * recon buffer Stride 82 * 83 * @param[in] pu2_scaling_list 84 * pointer to scaling list 85 * 86 * @param[in] pu2_norm_adjust 87 * pointer to inverse scale matrix 88 * 89 * @param[in] u4_qp_div_6 90 * Floor (qp/6) 91 * 92 * @param[in] pi4_tmp 93 * temporary buffer of size 1*16 94 * 95 * @returns none 96 * 97 * @remarks none 98 * 99 ******************************************************************************* 100 */ 101 void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src, 102 UWORD8 *pu1_pred, 103 UWORD8 *pu1_out, 104 WORD32 pred_strd, 105 WORD32 out_strd, 106 const UWORD16 *pu2_iscal_mat, 107 const UWORD16 *pu2_weigh_mat, 108 UWORD32 u4_qp_div_6, 109 WORD16 *pi2_tmp, 110 WORD32 iq_start_idx, 111 WORD16 *pi2_dc_ld_addr) 112 { 113 UWORD32 *pu4_out = (UWORD32 *)pu1_out; 114 WORD32 q0 = pi2_src[0]; 115 WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; 116 117 __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3; 118 __m128i sign_reg; 119 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 120 __m128i temp4, temp5, temp6, temp7; 121 __m128i value_add; 122 123 UNUSED (pi2_tmp); 124 125 INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); 126 127 if (iq_start_idx != 0 ) 128 q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case 129 130 i_macro = ((q0 + 32) >> 6); 131 132 value_add = _mm_set1_epi16(i_macro); 133 134 zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 135 //Load pred buffer 136 predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits 137 pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits 138 predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits 139 pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits 140 predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[2*pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits 141 pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits 142 predload_r = _mm_loadl_epi64((__m128i *) (&pu1_pred[3*pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits 143 pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits 144 145 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13 146 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33 147 148 temp4 = _mm_add_epi16(value_add, pred_r0); 149 temp5 = _mm_add_epi16(value_add, pred_r2); 150 /*------------------------------------------------------------------*/ 151 //Clipping the results to 8 bits 152 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check 153 temp4 = _mm_and_si128(temp4, sign_reg); 154 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check 155 temp5 = _mm_and_si128(temp5, sign_reg); 156 157 temp4 = _mm_packus_epi16(temp4,temp5); 158 temp5 = _mm_srli_si128(temp4,4); 159 temp6 = _mm_srli_si128(temp5,4); 160 temp7 = _mm_srli_si128(temp6,4); 161 162 *pu4_out = _mm_cvtsi128_si32(temp4); 163 pu1_out += out_strd; 164 pu4_out = (UWORD32 *)(pu1_out); 165 *(pu4_out) = _mm_cvtsi128_si32(temp5); 166 pu1_out += out_strd; 167 pu4_out = (UWORD32 *)(pu1_out); 168 *(pu4_out) = _mm_cvtsi128_si32(temp6); 169 pu1_out += out_strd; 170 pu4_out = (UWORD32 *)(pu1_out); 171 *(pu4_out) = _mm_cvtsi128_si32(temp7); 172 } 173 /** 174 ******************************************************************************* 175 * 176 * @brief 177 * This function performs inverse quant and Inverse transform type Ci4 for 8x8 block 178 * for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is 179 * non-zero. For complete function, refer ih264_iquant_itrans_recon_ssse3.c 180 * 181 * @par Description: 182 * Performs inverse transform Ci8 and adds the residue to get the 183 * reconstructed block 184 * 185 * @param[in] pi2_src 186 * Input 8x8coefficients 187 * 188 * @param[in] pu1_pred 189 * Prediction 8x8 block 190 * 191 * @param[out] pu1_recon 192 * Output 8x8 block 193 * 194 * @param[in] q_div 195 * QP/6 196 * 197 * @param[in] q_rem 198 * QP%6 199 * 200 * @param[in] q_lev 201 * Quantizer level 202 * 203 * @param[in] u4_src_stride 204 * Input stride 205 * 206 * @param[in] u4_pred_stride, 207 * Prediction stride 208 * 209 * @param[in] u4_out_stride 210 * Output Stride 211 * 212 * @param[in] pi4_tmp 213 * temporary buffer of size 1*64 214 * the tmp for each block 215 * 216 * @param[in] pu4_iquant_mat 217 * Pointer to the inverse quantization matrix 218 * 219 * @returns Void 220 * 221 * @remarks 222 * None 223 * 224 ******************************************************************************* 225 */ 226 227 void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src, 228 UWORD8 *pu1_pred, 229 UWORD8 *pu1_out, 230 WORD32 pred_strd, 231 WORD32 out_strd, 232 const UWORD16 *pu2_iscale_mat, 233 const UWORD16 *pu2_weigh_mat, 234 UWORD32 qp_div, 235 WORD16 *pi2_tmp, 236 WORD32 iq_start_idx, 237 WORD16 *pi2_dc_ld_addr) 238 { 239 WORD32 q0 = pi2_src[0]; 240 WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0; 241 242 __m128i predload_r,pred_r0, pred_r1, pred_r2, pred_r3,pred_r4,pred_r5,pred_r6,pred_r7; 243 __m128i sign_reg; 244 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 245 __m128i temp1,temp2,temp3,temp4, temp5, temp6, temp7,temp8; 246 __m128i value_add; 247 248 UNUSED (pi2_tmp); 249 UNUSED (iq_start_idx); 250 UNUSED (pi2_dc_ld_addr); 251 252 INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6); 253 i_macro = ((q0 + 32) >> 6); 254 255 value_add = _mm_set1_epi16(i_macro); 256 257 //Load pred buffer row 0 258 predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 259 pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 260 //Load pred buffer row 1 261 predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 262 pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 263 //Load pred buffer row 2 264 predload_r = _mm_loadl_epi64( 265 (__m128i *)(&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 266 pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 267 //Load pred buffer row 3 268 predload_r = _mm_loadl_epi64( 269 (__m128i *)(&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 270 pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 271 //Load pred buffer row 4 272 predload_r = _mm_loadl_epi64( 273 (__m128i *)(&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 274 pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 275 //Load pred buffer row 5 276 predload_r = _mm_loadl_epi64( 277 (__m128i *)(&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bit 278 pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 279 //Load pred buffer row 6 280 predload_r = _mm_loadl_epi64( 281 (__m128i *)(&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 282 pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 283 //Load pred buffer row 7 284 predload_r = _mm_loadl_epi64( 285 (__m128i *)(&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits 286 pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits 287 288 temp1 = _mm_add_epi16(value_add, pred_r0); 289 290 temp2 = _mm_add_epi16(value_add, pred_r1); 291 292 temp3 = _mm_add_epi16(value_add, pred_r2); 293 294 temp4 = _mm_add_epi16(value_add, pred_r3); 295 296 temp5 = _mm_add_epi16(value_add, pred_r4); 297 298 temp6 = _mm_add_epi16(value_add, pred_r5); 299 300 temp7 = _mm_add_epi16(value_add, pred_r6); 301 302 temp8 = _mm_add_epi16(value_add, pred_r7); 303 /*------------------------------------------------------------------*/ 304 //Clipping the results to 8 bits 305 sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check 306 temp1 = _mm_and_si128(temp1, sign_reg); 307 sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check 308 temp2 = _mm_and_si128(temp2, sign_reg); 309 sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check 310 temp3 = _mm_and_si128(temp3, sign_reg); 311 sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check 312 temp4 = _mm_and_si128(temp4, sign_reg); 313 sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check 314 temp5 = _mm_and_si128(temp5, sign_reg); 315 sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check 316 temp6 = _mm_and_si128(temp6, sign_reg); 317 sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check 318 temp7 = _mm_and_si128(temp7, sign_reg); 319 sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check 320 temp8 = _mm_and_si128(temp8, sign_reg); 321 322 temp1 = _mm_packus_epi16(temp1, zero_8x16b); 323 temp2 = _mm_packus_epi16(temp2, zero_8x16b); 324 temp3 = _mm_packus_epi16(temp3, zero_8x16b); 325 temp4 = _mm_packus_epi16(temp4, zero_8x16b); 326 temp5 = _mm_packus_epi16(temp5, zero_8x16b); 327 temp6 = _mm_packus_epi16(temp6, zero_8x16b); 328 temp7 = _mm_packus_epi16(temp7, zero_8x16b); 329 temp8 = _mm_packus_epi16(temp8, zero_8x16b); 330 331 _mm_storel_epi64((__m128i *)(&pu1_out[0]), temp1); 332 _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), temp2); 333 _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), temp3); 334 _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), temp4); 335 _mm_storel_epi64((__m128i *)(&pu1_out[4 * out_strd]), temp5); 336 _mm_storel_epi64((__m128i *)(&pu1_out[5 * out_strd]), temp6); 337 _mm_storel_epi64((__m128i *)(&pu1_out[6 * out_strd]), temp7); 338 _mm_storel_epi64((__m128i *)(&pu1_out[7 * out_strd]), temp8); 339 } 340 341 /* 342 ******************************************************************************** 343 * 344 * @brief This function reconstructs a 4x4 sub block from quantized chroma resiude and 345 * prediction buffer 346 * 347 * @par Description: 348 * The quantized residue is first inverse quantized, then inverse transformed. 349 * This inverse transformed content is added to the prediction buffer to recon- 350 * struct the end output 351 * 352 * @param[in] pi2_src 353 * quantized 4x4 block 354 * 355 * @param[in] pu1_pred 356 * prediction 4x4 block 357 * 358 * @param[out] pu1_out 359 * reconstructed 4x4 block 360 * 361 * @param[in] src_strd 362 * quantization buffer stride 363 * 364 * @param[in] pred_strd, 365 * Prediction buffer stride 366 * 367 * @param[in] out_strd 368 * recon buffer Stride 369 * 370 * @param[in] pu2_scaling_list 371 * pointer to scaling list 372 * 373 * @param[in] pu2_norm_adjust 374 * pointer to inverse scale matrix 375 * 376 * @param[in] u4_qp_div_6 377 * Floor (qp/6) 378 * 379 * @param[in] pi4_tmp 380 * temporary buffer of size 1*16 381 * 382 * @returns none 383 * 384 * @remarks none 385 * 386 ******************************************************************************* 387 */ 388 void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src, 389 UWORD8 *pu1_pred, 390 UWORD8 *pu1_out, 391 WORD32 pred_strd, 392 WORD32 out_strd, 393 const UWORD16 *pu2_iscal_mat, 394 const UWORD16 *pu2_weigh_mat, 395 UWORD32 u4_qp_div_6, 396 WORD16 *pi2_tmp, 397 WORD16 *pi2_dc_src) 398 { 399 WORD16 q0 = pi2_dc_src[0]; // DC value won't be dequantized for chroma inverse transform 400 WORD16 i_macro = ((q0 + 32) >> 6); 401 402 __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg; 403 __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 404 __m128i chroma_mask = _mm_set1_epi16 (0xFF); 405 __m128i value_add = _mm_set1_epi16(i_macro); 406 __m128i out_r0, out_r1, out_r2, out_r3; 407 408 UNUSED (pi2_src); 409 UNUSED (pu2_iscal_mat); 410 UNUSED (pu2_weigh_mat); 411 UNUSED (u4_qp_div_6); 412 UNUSED (pi2_tmp); 413 414 //Load pred buffer 415 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits 416 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits 417 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits 418 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits 419 420 pred_r0 = _mm_and_si128(pred_r0, chroma_mask); 421 pred_r1 = _mm_and_si128(pred_r1, chroma_mask); 422 pred_r2 = _mm_and_si128(pred_r2, chroma_mask); 423 pred_r3 = _mm_and_si128(pred_r3, chroma_mask); 424 425 pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13 426 pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22p p23 p30 p31 p32 p33 427 428 pred_r0 = _mm_add_epi16(value_add, pred_r0); 429 pred_r2 = _mm_add_epi16(value_add, pred_r2); 430 431 /*------------------------------------------------------------------*/ 432 //Clipping the results to 8 bits 433 sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b); // sign check 434 pred_r0 = _mm_and_si128(pred_r0, sign_reg); 435 sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b); 436 pred_r2 = _mm_and_si128(pred_r2, sign_reg); 437 438 pred_r0 = _mm_packus_epi16(pred_r0, pred_r2); 439 pred_r1 = _mm_srli_si128(pred_r0, 4); 440 pred_r2 = _mm_srli_si128(pred_r1, 4); 441 pred_r3 = _mm_srli_si128(pred_r2, 4); 442 443 pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- all 16 bits 444 pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- all 16 bits 445 pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits 446 pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits 447 448 chroma_mask = _mm_set1_epi16 (0xFF00); 449 out_r0 = _mm_loadl_epi64((__m128i *) (&pu1_out[0])); 450 out_r1 = _mm_loadl_epi64((__m128i *) (&pu1_out[out_strd])); 451 out_r2 = _mm_loadl_epi64((__m128i *) (&pu1_out[2 * out_strd])); 452 out_r3 = _mm_loadl_epi64((__m128i *) (&pu1_out[3 * out_strd])); 453 454 out_r0 = _mm_and_si128(out_r0, chroma_mask); 455 out_r1 = _mm_and_si128(out_r1, chroma_mask); 456 out_r2 = _mm_and_si128(out_r2, chroma_mask); 457 out_r3 = _mm_and_si128(out_r3, chroma_mask); 458 459 out_r0 = _mm_add_epi8(out_r0, pred_r0); 460 out_r1 = _mm_add_epi8(out_r1, pred_r1); 461 out_r2 = _mm_add_epi8(out_r2, pred_r2); 462 out_r3 = _mm_add_epi8(out_r3, pred_r3); 463 464 _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0); 465 _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1); 466 _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2); 467 _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3); 468 } 469 470 471