/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264. Each function computes the residue, applies the core (or Hadamard)
 *  transform and then quantizes the coefficients.
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4_sse42()
 *  - ih264_resi_trans_quant_chroma_4x4_sse42()
 *  - ih264_hadamard_quant_4x4_sse42()
 *  - ih264_hadamard_quant_2x2_uv_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* System include files */
#include <stddef.h>
#include <immintrin.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_macros.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
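
/*
 * For reference, a scalar sketch of the 1-D forward core transform butterfly
 * that the SIMD kernels below apply once per direction, interleaved with
 * matrix transposes. This is an illustrative model only, kept out of the
 * build; the helper name and the in-place array interface are hypothetical
 * and not part of the library API.
 */
#if 0
static void ih264_fwd_core_1d_sketch(WORD16 z[4])
{
    WORD16 x0 = z[0] + z[3];
    WORD16 x1 = z[1] + z[2];
    WORD16 x2 = z[1] - z[2];
    WORD16 x3 = z[0] - z[3];

    z[0] = (WORD16)(x0 + x1);              /* DC term                      */
    z[1] = (WORD16)((x3 << 1) + x2);       /* matches the z1 step below    */
    z[2] = (WORD16)(x0 - x1);
    z[3] = (WORD16)(x3 - (x2 << 1));       /* matches the z3 step below    */
}
#endif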
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4 block.
 *
 * @par Description:
 *  The function accepts a source buffer and a prediction buffer, from which
 *  it computes the residue. The residue is then transformed and quantized.
 *  The transform and quantization are computed in place, using the residue
 *  buffer.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to forward quant scale matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to forward quant threshold matrix (unused here)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *  Address at which the unquantized DC coefficient is stored
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
                                      WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                                      const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
                                      UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
                                      WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i sum0, sum1, sum2, cmp0, cmp1;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;

    UNUSED(pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));     //b00 b01 b02 b03 b10 b11 b12 b13 -- scaling matrix rows 0,1
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- scaling matrix rows 2,3
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));            //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd]));     //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    src_r0 = _mm_cvtepu8_epi16(src_r0);
    src_r1 = _mm_cvtepu8_epi16(src_r1);
    src_r2 = _mm_cvtepu8_epi16(src_r2);
    src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));             //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd]));     //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    /* Compute the residue */
    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    /*-------------------------------------------------------------*/
    /* DCT [ Vertical transformation ]                             */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    /* Save the unquantized DC coefficient for the caller */
    tmp_dc = _mm_extract_epi16(src_r0, 0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0); //Sign = -2 or 0
    sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0); //Sign = -1 or 1
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits);
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total non-zero coefficients in the current sub-block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}
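
/*
 * Illustrative scalar model of the quantization step implemented above with
 * _mm_abs_epi16 / _mm_mullo_epi32 / _mm_srli_epi32 / _mm_sign_epi16: quantize
 * the magnitudes, then restore the signs. The loop and helper name are
 * hypothetical (documentation only, kept out of the build), not library API.
 */
#if 0
static void ih264_fwd_quant_sketch(const WORD16 *pi2_coef, WORD16 *pi2_out,
                                   const UWORD16 *pu2_scale_matrix,
                                   UWORD32 u4_round_factor, UWORD32 u4_qbits)
{
    WORD32 i;
    for(i = 0; i < 16; i++)
    {
        WORD32 sign = (pi2_coef[i] < 0) ? -1 : 1;
        UWORD32 mag = (UWORD32)((pi2_coef[i] < 0) ? -pi2_coef[i] : pi2_coef[i]);

        /* quantized magnitude = (|coef| * scale + round) >> qbits */
        pi2_out[i] = (WORD16)(sign * (WORD32)((mag * pu2_scale_matrix[i]
                                               + u4_round_factor) >> u4_qbits));
    }
}
#endif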
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4 chroma
 *  block.
 *
 * @par Description:
 *  The function accepts a source buffer and a prediction buffer, from which
 *  it computes the residue. The residue is then transformed and quantized.
 *  The transform and quantization are computed in place, using the residue
 *  buffer. Source and prediction are read from interleaved (semi-planar)
 *  chroma buffers, so only alternate bytes of each row are used.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to forward quant scale matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to forward quant threshold matrix (unused here)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *  Address at which the unquantized DC coefficient is stored
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD16 *pi2_out,
                                             WORD32 src_strd, WORD32 pred_strd,
                                             const UWORD16 *pu2_scale_matrix,
                                             const UWORD16 *pu2_threshold_matrix,
                                             UWORD32 u4_qbits, UWORD32 u4_round_factor,
                                             UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i chroma_mask = _mm_set1_epi16(0xFF);

    UNUSED(pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));     //b00 b01 b02 b03 b10 b11 b12 b13 -- scaling matrix rows 0,1
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- scaling matrix rows 2,3
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));            //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd]));     //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    /* Keep only the current plane's samples: the low byte of each 16-bit
     * lane of the interleaved UV data. This widens to 16 bits in the same
     * step, so no separate _mm_cvtepu8_epi16() is needed here. */
    src_r0 = _mm_and_si128(src_r0, chroma_mask);
    src_r1 = _mm_and_si128(src_r1, chroma_mask);
    src_r2 = _mm_and_si128(src_r2, chroma_mask);
    src_r3 = _mm_and_si128(src_r3, chroma_mask);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));             //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd]));     //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    /* Compute the residue */
    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    /*-------------------------------------------------------------*/
    /* DCT [ Vertical transformation ]                             */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    /* Save the unquantized DC coefficient for the caller */
    tmp_dc = _mm_extract_epi16(src_r0, 0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0); //Sign = -2 or 0
    sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0); //Sign = -1 or 1
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits);
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total non-zero coefficients in the current sub-block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}
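
/*
 * A minimal sketch of the de-interleaving trick used above (hypothetical
 * helper, not part of the library API): with semi-planar UV input, masking
 * each 16-bit lane with 0x00FF both selects one chroma plane (whichever one
 * pu1_row points at) and widens the samples to 16 bits, replacing the
 * _mm_cvtepu8_epi16() step of the luma path.
 */
#if 0
static __m128i load_one_chroma_row_sketch(const UWORD8 *pu1_row)
{
    __m128i interleaved = _mm_loadl_epi64((const __m128i *)pu1_row); //u0 v0 u1 v1 u2 v2 u3 v3
    return _mm_and_si128(interleaved, _mm_set1_epi16(0xFF));         //u0 u1 u2 u3 as 16-bit lanes
}
#endif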
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs a forward Hadamard transform and quantization on a
 *  4x4 block of luma DC coefficients.
 *
 * @par Description:
 *  The function accepts a buffer of DC coefficients, applies the 4x4 Hadamard
 *  transform (with a final >> 1 scaling) and quantizes the result using the
 *  DC entry of the scale matrix, pu2_scale_matrix[0].
 *
 * @param[in] pi2_src
 *  Pointer to source DC coefficients
 *
 * @param[out] pi2_dst
 *  Pointer to destination buffer for the quantized coefficients
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to forward quant scale matrix; only entry 0 is used
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to forward quant threshold matrix (unused here)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                    const UWORD16 *pu2_scale_matrix,
                                    const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                    UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 u4_zero_coeff, u4_nonzero_coeff = 0;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    WORD32 mask0, mask1;
    __m128i src_r0_r1, src_r2_r3, sign_reg;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i temp0, temp1, temp2, temp3;
    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);

    UNUSED(pu2_threshold_matrix);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));     //a00 a01 a02 a03 a10 a11 a12 a13 -- source rows 0,1
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- source rows 2,3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3

    /* Perform forward transform */
    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Horizontal transformation ]          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);  //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);  //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);  //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);  //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Vertical transformation ]            */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);  //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);  //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);  //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);  //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /* Scale the transformed values by half, as per the spec */
    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >=0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0); //Sign = -1 or 1
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0); //Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0); //Multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits); //Right shift by qbits; operands are non-negative, so a logical shift works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1); //Final values fit in 16 bits
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total non-zero coefficients in the current sub-block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] = u4_nonzero_coeff;
}
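
/*
 * Scalar sketch of the 1-D Hadamard butterfly applied above to rows and then
 * columns of the DC block (illustrative only; the helper name and interface
 * are hypothetical). Note the output ordering relative to the core transform:
 * the z1/z3 results come from the difference terms, and no <<1 weighting is
 * used.
 */
#if 0
static void ih264_hadamard_1d_sketch(WORD32 z[4])
{
    WORD32 x0 = z[0] + z[3];
    WORD32 x1 = z[1] + z[2];
    WORD32 x2 = z[1] - z[2];
    WORD32 x3 = z[0] - z[3];

    z[0] = x0 + x1;
    z[1] = x2 + x3;   /* src_r1 = temp2 + temp3 above */
    z[2] = x0 - x1;
    z[3] = x3 - x2;   /* src_r3 = temp3 - temp2 above */
}
#endif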
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs a forward Hadamard transform and quantization on a
 *  2x2 block of chroma DC coefficients, for both the U and V planes.
 *
 * @par Description:
 *  The function accepts a buffer holding the 2x2 DC blocks of the two chroma
 *  planes back to back, applies the 2x2 Hadamard transform to each plane and
 *  quantizes the result using pu2_scale_matrix[0].
 *
 * @param[in] pi2_src
 *  Pointer to source DC coefficients (first plane followed by second plane)
 *
 * @param[out] pi2_dst
 *  Pointer to destination buffer for the quantized coefficients
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to forward quant scale matrix; only entry 0 is used
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to forward quant threshold matrix (unused here)
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients per plane
 *
 * @returns
 *  None
 *
 * @remarks
 *  The NNZ for DC is returned per plane: pu1_nnz[0] for the first plane and
 *  pu1_nnz[1] for the second.
 *
 *******************************************************************************
 */
void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                       const UWORD16 *pu2_scale_matrix,
                                       const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                       UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 val, zero_coeff_0 = 0, zero_coeff_1 = 0;
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED(pu2_threshold_matrix);

    src = _mm_loadu_si128((__m128i *)pi2_src);   //a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits

    /* 2x2 Hadamard transform of both planes via horizontal add/subtract */
    temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >=0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);          //Sign = -2 or 0
    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);      //Sign = -1 or 1

    plane_0 = _mm_abs_epi32(plane_0); //Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0); //Multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1); //Multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0, rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits); //Right shift by qbits; operands are non-negative, so a logical shift works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);    //Final values fit in 16 bits
    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    /* Count zero coefficients per plane: the low four 16-bit lanes hold
     * plane 0, the high four hold plane 1 */
    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask >> 8;
    if(mask0)
    {
        if(mask0 == 0xff)
            zero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            zero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            zero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            zero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

    /* Return total non-zero coefficients for each plane */
    pu1_nnz[0] = 4 - zero_coeff_0;
    pu1_nnz[1] = 4 - zero_coeff_1;
}
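
/*
 * Scalar sketch of the 2x2 Hadamard transform computed above for each chroma
 * plane (illustrative only; the helper name and layout are hypothetical).
 * For a 2x2 DC block c[0..3] stored row-major, the transform is the four
 * sign patterns of sums that the hadd/hsub sequence above produces.
 */
#if 0
static void ih264_hadamard_2x2_sketch(const WORD32 c[4], WORD32 out[4])
{
    out[0] = c[0] + c[1] + c[2] + c[3];
    out[1] = c[0] - c[1] + c[2] - c[3];
    out[2] = c[0] + c[1] - c[2] - c[3];
    out[3] = c[0] - c[1] - c[2] + c[3];
}
#endif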