/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_inter_pred_filters_intr_ssse3.c                */
/*                                                                           */
/*  Description       : Contains function definitions for inter-prediction   */
/*                      (luma/chroma sample interpolation) functions in      */
/*                      x86 SSSE3 intrinsics                                 */
/*                                                                           */
/*  List of Functions : ih264_inter_pred_luma_copy_ssse3()                   */
/*                      ih264_inter_pred_luma_horz_ssse3()                   */
/*                      ih264_inter_pred_luma_vert_ssse3()                   */
/*                      ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_qpel_ssse3()              */
/*                      ih264_inter_pred_luma_vert_qpel_ssse3()              */
/*                      ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3()    */
/*                      ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3()    */
/*                      ih264_inter_pred_chroma_ssse3()                      */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/*****************************************************************************/
/*  File Includes                                                            */
/*****************************************************************************/

#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"

/*****************************************************************************/
/*  Constant Data variables                                                  */
/*****************************************************************************/

/* coefficients for 6 tap filtering*/
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
/*****************************************************************************/
/*  Function definitions .                                                   */
/*****************************************************************************/
/*****************************************************************************/
/*                                                                           */
/*  Function Name   : ih264_inter_pred_luma_copy_ssse3                       */
/*                                                                           */
/*  Description     : This function copies the contents of ht x wd block     */
/*                    from source to destination. (ht,wd) can be (4,4),      */
/*                    (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).        */
/*                                                                           */
/*  Inputs          : puc_src  - pointer to source                           */
/*                    puc_dst  - pointer to destination                      */
/*                    src_strd - stride for source                           */
/*                    dst_strd - stride for destination                      */
/*                    ht       - height of the block                         */
/*                    wd       - width of the block                          */
/*                                                                           */
/*  Issues          : None                                                   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/* NOTE(review): pu1_tmp and dydx are unused here; they exist only so the    */
/* signature matches the other inter-prediction function pointers.          */
/* The loops subtract 4 (wd==4/8) or 8 (wd==16) from ht per iteration, so   */
/* ht is assumed to be a multiple of 4 resp. 8 — TODO confirm with callers. */
void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    /* Pre-compute small stride multiples so each row address is one add. */
    src_strd2 = src_strd << 1;
    dst_strd2 = dst_strd << 1;
    src_strd4 = src_strd << 2;
    dst_strd4 = dst_strd << 2;
    src_strd3 = src_strd2 + src_strd;
    dst_strd3 = dst_strd2 + dst_strd;

    if(wd == 4)
    {
        /* 4-byte rows: copy four rows per iteration with 32-bit moves.    */
        /* NOTE(review): the WORD32 punning assumes src/dst rows are       */
        /* 4-byte addressable without alignment faults on x86.             */
        do
        {
            *((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src));
            *((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
            *((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
            *((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* 8-byte rows: copy four rows per iteration with 64-bit SSE moves. */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd4;
            pu1_dst += dst_strd4;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* 16-byte rows: copy eight rows per iteration with unaligned      */
        /* 128-bit loads/stores.                                           */
        WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
        WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;

        __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b;

        src_strd5 = src_strd2 + src_strd3;
        dst_strd5 = dst_strd2 + dst_strd3;
        src_strd6 = src_strd3 << 1;
        dst_strd6 = dst_strd3 << 1;
        src_strd7 = src_strd3 + src_strd4;
        dst_strd7 = dst_strd3 + dst_strd4;
        src_strd8 = src_strd << 3;
        dst_strd8 = dst_strd << 3;

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3));
            y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4));
            y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5));
            y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6));
            y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7));

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b);

            ht -= 8;
            pu1_src += src_strd8;
            pu1_dst += dst_strd8;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name   : ih264_inter_pred_luma_horz_ssse3                       */
/*                                                                           */
/*
 Description     : This function applies a horizontal 6-tap filter on     */
/*                    ht x wd block as mentioned in sec. 8.4.2.2.1 titled    */
/*                    "Luma sample interpolation process". (ht,wd) can be    */
/*                    (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
/*                                                                           */
/*  Inputs          : puc_src  - pointer to source                           */
/*                    puc_dst  - pointer to destination                      */
/*                    src_strd - stride for source                           */
/*                    dst_strd - stride for destination                      */
/*                    ht       - height of the block                         */
/*                    wd       - width of the block                          */
/*                                                                           */
/*  Issues          : None                                                   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/* Filter taps are (1, -5, 20, 20, -5, 1); each output is                    */
/* (sum + 16) >> 5, saturated to [0,255] by the final packus.                */
/* NOTE(review): pu1_tmp and dydx are unused; signature kept uniform with    */
/* the other inter-prediction function pointers.                             */
void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    /* Interleaved tap pairs for pmaddubsw: c0 c1 | c2 c3 | c4 c5 repeated. */
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 repeated: 1, -5
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 repeated: 20, 20
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 repeated: -5, 1
                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);       // rounding offset before >> 5

    if(wd == 4)
    {
        /* Two rows per iteration; both rows packed into one register so a  */
        /* single maddubs pass computes 4+4 outputs.                        */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b;
        __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0..a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0..b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1..a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1..b15 0

            /* Pair adjacent samples for the c0/c1 multiply-add. */
            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 ... a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 ... b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); //a{i}*c0+a{i+1}*c1, then b...

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                  //advance pairs by 2 samples
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b); //a{i+2}*c2+a{i+3}*c3, then b...

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b); //a{i+4}*c4+a{i+5}*c5, then b...

            /* sum the three tap-pair partials + 16, then >> 5 and saturate. */
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits.

            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* Two rows per iteration, 8 outputs per row.                       */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0..a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0..b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1..a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1..b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 ... a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 ... b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a{i}*c0+a{i+1}*c1 for i=0..7
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b{i}*c0+b{i+1}*c1 for i=0..7

            /* shift both source copies by 2 samples for the next tap pair. */
            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 ... a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 ... b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a{i+2}*c2+a{i+3}*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b{i+2}*c2+b{i+3}*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 ... a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 ... b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a{i+4}*c4+a{i+5}*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b{i+4}*c4+b{i+5}*c5

            /* sum partials + 16, >> 5, saturate. */
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* One row per iteration; the two halves of the row are processed   */
        /* as "row0" (x+0..) and "row1" (x+8..) in parallel.                */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same as a8. Similarly other bn pixels are same as a(n+8) pixels.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);       //a0..a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0..b15 (= a8..a23)

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 ... a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 ... b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 ... a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 ... b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 ... a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 ... b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);

            /* sum partials + 16, >> 5, saturate; pack both halves into one */
            /* 16-byte store.                                               */
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name   : ih264_inter_pred_luma_vert_ssse3                       */
/*                                                                           */
/*  Description     : This function applies a vertical 6-tap filter on       */
/*                    ht x wd block as mentioned in sec. 8.4.2.2.1 titled    */
/*                    "Luma sample interpolation process". (ht,wd) can be    */
/*                    (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).
 */
/*                                                                           */
/*  Inputs          : puc_src  - pointer to source                           */
/*                    puc_dst  - pointer to destination                      */
/*                    src_strd - stride for source                           */
/*                    dst_strd - stride for destination                      */
/*                    ht       - height of the block                         */
/*                    wd       - width of the block                          */
/*                                                                           */
/*  Issues          : None                                                   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/* Vertical 6-tap: each output row needs rows y-2..y+3. Rows are             */
/* interleaved pairwise (unpack) so pmaddubsw applies the tap pairs          */
/* (1,-5), (20,20), (-5,1); result is (sum + 16) >> 5, saturated.            */
/* NOTE(review): pu1_tmp and dydx are unused; signature kept uniform.        */
void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
    __m128i src_r5_16x8b, src_r6_16x8b;
    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 repeated: 1, -5
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 repeated: 20, 20
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 repeated: -5, 1
                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);       // rounding offset before >> 5

    pu1_src -= src_strd << 1; // the filter input starts from y[-2] (till y[3])

    if(wd == 4)
    {
        // Prologue: load the five context rows needed before the first
        // output pair; rows are packed two-per-register via epi32 unpack.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        /* After these unpacks, src_rN holds rows N and N+1 side by side.   */
        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);

            /* Interleave row pairs so each 16-bit madd spans two rows.     */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);

            /* Slide the row window down by two. */
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }

    else if(wd == 8)
    {
        // Prologue: load the five context rows needed before the first
        // output pair; rows packed two-per-register via epi64 unpack.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);

            /* First output row: low-half interleave of the row pairs.      */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

            /* Second output row: high-half interleave (rows shifted by 1). */
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i res_t0_8x16b;

        // Prologue: load the five full-width context rows before the first
        // output pair.
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            /* Row y: left 8 columns (low interleave of rows y-2..y+3).     */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            /* Row y: right 8 columns (high interleave).                    */
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            /* Row y+1: same two passes on rows y-1..y+4.                   */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            /* Slide the row window down by two. */
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name   : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3        */
/*                                                                           */
/*  Description     : This function implements a two stage cascaded six tap  */
/*                    filter, horizontally and then vertically on ht x wd    */
/*                    block as mentioned in sec. 8.4.2.2.1 titled "Luma      */
/*                    sample interpolation process". (ht,wd) can be (4,4),   */
/*                    (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).        */
/*                                                                           */
/*  Inputs          : puc_src  - pointer to source                           */
/*                    puc_dst  - pointer to destination                      */
/*                    src_strd - stride for source                           */
/*                    dst_strd - stride for destination                      */
/*                    ht       - height of the block                         */
/*                    wd       - width of the block                          */
/*                    pu1_tmp  - pointer to temporary buffer                 */
/*                                                                           */
/*  Issues          : None                                                   */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 src_strd,
                                                     WORD32 dst_strd,
                                                     WORD32 ht,
                                                     WORD32 wd,
                                                     UWORD8* pu1_tmp,
                                                     WORD32 dydx)
{
    UNUSED(dydx);

    if(wd == 4)
    {
        WORD16 *pi2_temp;

        pu1_tmp += 4;
        pu1_src -= src_strd << 1;
        pi2_temp = (WORD16 *)pu1_tmp;
        pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

        // Horizontal 6-tap filtering
        {
            WORD32 ht_tmp = ht
+ 4; 790 791 __m128i src_r0_16x8b, src_r1_16x8b; 792 __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; 793 __m128i src_r0r1_t1_16x8b; 794 __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; 795 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 796 797 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 798 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 799 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 800 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 801 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 802 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 803 804 do 805 { 806 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 807 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 808 809 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 810 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 811 812 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 813 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 814 815 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 816 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 817 //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 818 819 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 820 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 821 822 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 823 
res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 824 //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 825 826 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 827 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 828 829 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 830 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 831 //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 832 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); 833 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b); 834 835 _mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b); 836 837 ht_tmp -= 2; 838 pu1_src += src_strd << 1; 839 pi2_temp += 8; 840 } 841 while(ht_tmp > 0); 842 843 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 844 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 845 846 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 847 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 848 849 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 850 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 851 852 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 853 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 854 855 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); 856 res_r0r1_t1_8x16b = 
_mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b); 857 858 _mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b); 859 } 860 861 pi2_temp = (WORD16 *)pu1_tmp; 862 863 // Vertical 6-tap filtering 864 { 865 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, 866 src_r4_8x16b; 867 __m128i src_r5_8x16b, src_r6_8x16b; 868 __m128i src_t1_8x16b, src_t2_8x16b; 869 870 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 871 __m128i res_8x16b, res_16x8b; 872 873 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 874 __m128i const_val512_4x32b; 875 876 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 877 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 878 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 879 880 const_val512_4x32b = _mm_set1_epi32(512); 881 882 src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp)); 883 src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4)); 884 src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8)); 885 src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12)); 886 src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16)); 887 pi2_temp += 20; 888 889 do 890 { 891 src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp); 892 src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4)); 893 894 src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 895 src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 896 src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 897 898 res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b); 899 res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b); 900 res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b); 901 902 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 903 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 904 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 905 906 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 907 908 src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 909 src_t1_8x16b = 
_mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 910 src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 911 912 res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b); 913 res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b); 914 res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b); 915 916 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 917 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 918 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 919 920 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 921 922 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 923 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 924 925 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b); 926 res_16x8b = _mm_srli_si128(res_16x8b, 4); 927 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b); 928 929 src_r0_8x16b = src_r2_8x16b; 930 src_r1_8x16b = src_r3_8x16b; 931 src_r2_8x16b = src_r4_8x16b; 932 src_r3_8x16b = src_r5_8x16b; 933 src_r4_8x16b = src_r6_8x16b; 934 935 ht -= 2; 936 pi2_temp += 8; 937 pu1_dst += dst_strd << 1; 938 } 939 while(ht > 0); 940 } 941 } 942 else if(wd == 8) 943 { 944 WORD16 *pi2_temp; 945 946 pu1_tmp += 4; 947 pu1_src -= src_strd << 1; 948 pi2_temp = (WORD16 *)pu1_tmp; 949 pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) 950 951 // Horizontal 6-tap filtering 952 { 953 WORD32 ht_tmp = ht + 4; 954 955 __m128i src_r0_16x8b, src_r1_16x8b; 956 __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; 957 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 958 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 959 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 960 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 961 962 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 963 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 964 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 
c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 965 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 966 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 967 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 968 969 do 970 { 971 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 972 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 973 974 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 975 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 976 977 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 978 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 979 980 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 981 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 982 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 983 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 984 985 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 986 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 987 988 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 989 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 990 991 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 992 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 993 994 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 995 
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 996 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 997 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 998 999 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 1000 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 1001 1002 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 1003 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 1004 1005 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 1006 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 1007 1008 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 1009 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 1010 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 1011 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 1012 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 1013 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 1014 1015 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 1016 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 1017 1018 _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); 1019 _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b); 1020 1021 ht_tmp -= 2; 1022 pu1_src += src_strd << 1; 1023 pi2_temp += 16; 1024 } 1025 while(ht_tmp > 0); 1026 1027 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 1028 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 1029 1030 
src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 1031 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 1032 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 1033 1034 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 1035 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 1036 1037 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 1038 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 1039 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 1040 1041 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 1042 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 1043 1044 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 1045 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 1046 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 1047 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 1048 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 1049 1050 _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); 1051 } 1052 1053 pi2_temp = (WORD16 *)pu1_tmp; 1054 1055 // Vertical 6-tap filtering 1056 { 1057 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, 1058 src_r4_8x16b; 1059 __m128i src_r5_8x16b, src_r6_8x16b; 1060 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 1061 1062 __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 1063 __m128i res_c0_4x32b, res_c1_4x32b; 1064 __m128i res_8x16b, res_16x8b; 1065 1066 __m128i coeff0_1_8x16b, coeff2_3_8x16b, 
coeff4_5_8x16b; 1067 __m128i const_val512_4x32b; 1068 1069 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 1070 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 1071 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 1072 1073 const_val512_4x32b = _mm_set1_epi32(512); 1074 1075 src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); 1076 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8)); 1077 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); 1078 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 24)); 1079 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32)); 1080 pi2_temp += 40; 1081 1082 do 1083 { 1084 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); 1085 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8)); 1086 1087 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 1088 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 1089 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 1090 1091 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1092 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1093 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1094 1095 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1096 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1097 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1098 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1099 1100 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 1101 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 1102 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 1103 1104 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1105 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1106 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1107 1108 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1109 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1110 res_t1_4x32b = 
_mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1111 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1112 1113 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1114 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1115 1116 _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b); 1117 1118 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 1119 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 1120 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 1121 1122 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1123 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1124 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1125 1126 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1127 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1128 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1129 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1130 1131 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 1132 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 1133 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 1134 1135 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1136 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1137 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1138 1139 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1140 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1141 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1142 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1143 1144 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1145 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1146 1147 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); 1148 1149 src_r0_8x16b = src_r2_8x16b; 1150 src_r1_8x16b = src_r3_8x16b; 1151 src_r2_8x16b = src_r4_8x16b; 1152 src_r3_8x16b = src_r5_8x16b; 1153 src_r4_8x16b = 
src_r6_8x16b; 1154 1155 ht -= 2; 1156 pi2_temp += 16; 1157 pu1_dst += dst_strd << 1; 1158 } 1159 while(ht > 0); 1160 } 1161 } 1162 else // wd == 16 1163 { 1164 WORD16 *pi2_temp; 1165 WORD32 ht_tmp; 1166 1167 pu1_tmp += 4; 1168 pu1_src -= src_strd << 1; 1169 pi2_temp = (WORD16 *)pu1_tmp; 1170 pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) 1171 1172 // Horizontal 6-tap filtering 1173 { 1174 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 1175 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 1176 1177 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 1178 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 1179 1180 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 1181 1182 ht_tmp = ht + 5; 1183 1184 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 1185 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 1186 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 1187 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 1188 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 1189 //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 1190 //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 
1191 1192 do 1193 { 1194 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 1195 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 1196 1197 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 1198 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 1199 1200 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 1201 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 1202 1203 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 1204 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 1205 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 1206 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 1207 1208 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 1209 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 1210 1211 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 1212 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 1213 1214 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 1215 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 1216 1217 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 1218 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 1219 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 1220 //b6*c2+b7*c3 
b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 1221 1222 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 1223 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 1224 1225 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 1226 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 1227 1228 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 1229 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 1230 1231 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 1232 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 1233 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 1234 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 1235 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 1236 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 1237 1238 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 1239 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 1240 1241 _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b); 1242 _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b); 1243 1244 ht_tmp--; 1245 pu1_src += src_strd; 1246 pi2_temp += 16; 1247 } 1248 while(ht_tmp > 0); 1249 } 1250 1251 pi2_temp = (WORD16 *)pu1_tmp; 1252 1253 // Vertical 6-tap filtering 1254 { 1255 WORD16 *pi2_temp2; 1256 UWORD8 *pu1_dst2; 1257 WORD32 ht_tmp; 1258 1259 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b; 1260 __m128i src_r5_8x16b, src_r6_8x16b; 1261 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 1262 1263 __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 1264 __m128i 
res_c0_4x32b, res_c1_4x32b; 1265 __m128i res_8x16b, res_16x8b; 1266 1267 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 1268 __m128i const_val512_4x32b; 1269 1270 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 1271 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 1272 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 1273 1274 const_val512_4x32b = _mm_set1_epi32(512); 1275 1276 pi2_temp2 = pi2_temp + 8; 1277 pu1_dst2 = pu1_dst + 8; 1278 ht_tmp = ht; 1279 1280 /**********************************************************/ 1281 /* Do first height x 8 block */ 1282 /**********************************************************/ 1283 src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); 1284 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); 1285 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32)); 1286 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48)); 1287 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64)); 1288 pi2_temp += 80; 1289 1290 do 1291 { 1292 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp); 1293 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16)); 1294 1295 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 1296 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 1297 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 1298 1299 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1300 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1301 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1302 1303 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1304 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1305 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1306 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1307 1308 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 1309 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 1310 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, 
src_r5_8x16b); 1311 1312 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1313 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1314 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1315 1316 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1317 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1318 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1319 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1320 1321 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1322 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1323 1324 _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b); 1325 1326 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 1327 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 1328 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 1329 1330 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1331 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1332 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1333 1334 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1335 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1336 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1337 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1338 1339 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 1340 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 1341 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 1342 1343 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1344 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1345 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1346 1347 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1348 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1349 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1350 res_c1_4x32b = 
_mm_srai_epi32(res_t1_4x32b, 10); 1351 1352 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1353 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1354 1355 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); 1356 1357 src_r0_8x16b = src_r2_8x16b; 1358 src_r1_8x16b = src_r3_8x16b; 1359 src_r2_8x16b = src_r4_8x16b; 1360 src_r3_8x16b = src_r5_8x16b; 1361 src_r4_8x16b = src_r6_8x16b; 1362 1363 ht_tmp -= 2; 1364 pi2_temp += 32; 1365 pu1_dst += dst_strd << 1; 1366 } 1367 while(ht_tmp > 0); 1368 1369 /**********************************************************/ 1370 /* Do second ht x 8 block */ 1371 /**********************************************************/ 1372 src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2); 1373 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); 1374 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); 1375 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48)); 1376 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64)); 1377 pi2_temp2 += 80; 1378 1379 do 1380 { 1381 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2); 1382 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); 1383 1384 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 1385 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 1386 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 1387 1388 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1389 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1390 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1391 1392 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1393 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1394 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1395 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1396 1397 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 1398 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 1399 
src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 1400 1401 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1402 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1403 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1404 1405 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1406 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1407 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1408 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1409 1410 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1411 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1412 1413 _mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b); 1414 1415 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 1416 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 1417 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 1418 1419 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1420 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1421 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1422 1423 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1424 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1425 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 1426 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1427 1428 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 1429 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 1430 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 1431 1432 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 1433 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 1434 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 1435 1436 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 1437 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 1438 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, 
res_t3_4x32b); 1439 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 1440 1441 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b); 1442 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 1443 1444 _mm_storel_epi64((__m128i *)(pu1_dst2 + dst_strd), res_16x8b); 1445 1446 src_r0_8x16b = src_r2_8x16b; 1447 src_r1_8x16b = src_r3_8x16b; 1448 src_r2_8x16b = src_r4_8x16b; 1449 src_r3_8x16b = src_r5_8x16b; 1450 src_r4_8x16b = src_r6_8x16b; 1451 1452 ht -= 2; 1453 pi2_temp2 += 32; 1454 pu1_dst2 += dst_strd << 1; 1455 } 1456 while(ht > 0); 1457 } 1458 } 1459 } 1460 1461 /*****************************************************************************/ 1462 /* */ 1463 /* Function Name : ih264_inter_pred_luma_horz_qpel_ssse3 */ 1464 /* */ 1465 /* Description : This function implements a six-tap filter horizontally */ 1466 /* on ht x wd block and averages the values with the source */ 1467 /* pixels to calculate horizontal quarter-pel as mentioned */ 1468 /* in sec. 8.4.2.2.1 titled "Luma sample interpolation */ 1469 /* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */ 1470 /* (16,8), (8,16) or (16,16). 
                                                                             */
/*                                                                           */
/*  Inputs        : puc_src  - pointer to source                             */
/*                  puc_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY    Author(s)       Changes                             */
/*         13 02 2015    Kaushik         Initial Version                     */
/*                       Senthoor                                            */
/*                                                                           */
/*****************************************************************************/
/* Horizontal quarter-pel: run the 6-tap (1,-5,20,20,-5,1) half-pel filter  */
/* across each row, then average (with rounding, via _mm_avg_epu8) against  */
/* the nearest full-pel column selected by the q-pel phase in dydx.         */
void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    WORD32 x_offset;
    UWORD8 *pu1_pred1;

    __m128i src_r0_16x8b, src_r1_16x8b;
    __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);

    /* Low two bits of dydx are the horizontal q-pel phase (1 or 3 here). */
    x_offset = dydx & 3;

    /* Coefficient pairs interleaved for _mm_maddubs_epi16 (unsigned src  */
    /* bytes * signed coeff bytes, pairwise-added into 16-bit lanes).     */
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 ... (0x01 = 1, 0xFB = -5)
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 ... (0x14 = 20)
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 ...
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20

    /* Full-pel column averaged with the half-pel output: phase 1 takes   */
    /* the left neighbour (offset 0), phase 3 the right one (offset 1).   */
    pu1_pred1 = pu1_src + (x_offset >> 1);

    const_val16_8x16b = _mm_set1_epi16(16); // rounding term for (sum + 16) >> 5

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        __m128i src_r0r1_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //Two rows are filtered per iteration, packed into one register.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            /* Interleave each row with itself shifted by one so maddubs   */
            /* sees adjacent (x[i], x[i+1]) pairs per 16-bit lane.         */
            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);  //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);  //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                  //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                  //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);  //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5

            /* Full-pel pixels for the q-pel average of both rows.         */
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);
                                        //each 16-bit lane now holds the full 6-tap sum + 16,
                                        //e.g. a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16
                                        //(four lanes per row, rows a and b).
            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);        //shifting right by 5 bits: (sum + 16) >> 5

            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b); //saturate to [0,255]
            res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b);   //computing q-pel: (hpel + fpel + 1) >> 1

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_r0_16x8b, res_r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //Two rows per iteration, each filtered in its own register.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 ... a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 ... b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 ... a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 ... b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b4 b5 b6 b7 b8 b9....b15 0 0 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a5 a6 a7 a8 a9....a15 0 0 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b5 b6 b7 b8 b9....b15 0 0 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 ... a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 ... b11*c4+b12*c5

            /* Full-pel pixels for the q-pel average of both rows.         */
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);            //shifting right by 5 bits: (sum + 16) >> 5

            res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
            res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b);         //computing q-pel: (hpel + fpel + 1) >> 1

            _mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
        //i.e. one 16-wide row per iteration, processed as two overlapping 8-wide halves.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);              //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);              //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);              //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 ... a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 ... b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 ... a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 ... b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                  //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                  //b4 b5 b6 b7 b8 b9....b15 0 0 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);          //a5 a6 a7 a8 a9....a15 0 0 0 0 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);          //b5 b6 b7 b8 b9....b15 0 0 0 0 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 ... a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 ... b11*c4+b12*c5

            /* Full-pel row for the q-pel average (all 16 output pixels).  */
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);            //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);  //merge both 8-pixel halves, saturating
            res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b);               //computing q-pel: (hpel + fpel + 1) >> 1

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_pred1 += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_qpel_ssse3                    */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically on */
/*                  ht x wd block and averages the values with the source   */
/*                  pixels to calculate vertical quarter-pel as mentioned in*/
/*                  sec. 8.4.2.2.1 titled "Luma sample interpolation        */
/*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),    */
/*                  (16,8), (8,16) or (16,16).
                                                                             */
/*                                                                           */
/*  Inputs        : puc_src  - pointer to source                             */
/*                  puc_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer                   */
/*                  dydx     - x and y reference offset for q-pel            */
/*                             calculations                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY    Author(s)       Changes                             */
/*         13 02 2015    Kaushik         Initial Version                     */
/*                       Senthoor                                            */
/*                                                                           */
/*****************************************************************************/
/* Vertical quarter-pel: run the 6-tap (1,-5,20,20,-5,1) half-pel filter    */
/* down each column, then average (with rounding, via _mm_avg_epu8) against */
/* the nearest full-pel row selected by the vertical q-pel phase in dydx.   */
void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    WORD32 y_offset;
    UWORD8 *pu1_pred1;


    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
    __m128i src_r5_16x8b, src_r6_16x8b;
    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    /* Low nibble of dydx carries both q-pel phases; bit 3 (i.e. a vertical */
    /* phase of 3) selects the lower full-pel row for the final average.    */
    y_offset = dydx & 0xf;

    /* Coefficient pairs interleaved for _mm_maddubs_epi16 (unsigned src    */
    /* bytes * signed coeff bytes, pairwise-added into 16-bit lanes).       */
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 ... (0x01 = 1, 0xFB = -5)
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 ... (0x14 = 20)
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 ...
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20

    /* Full-pel row averaged with the half-pel output (row 0 or row 1).     */
    pu1_pred1 = pu1_src + (y_offset >> 3) * src_strd;

    const_val16_8x16b = _mm_set1_epi16(16); // rounding term for (sum + 16) >> 5

    pu1_src -= src_strd << 1; // the filter input starts from row y[-2] (till y[3])

    if(wd == 4)
    {
        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        /* Pack consecutive 4-pixel rows side by side so two output rows    */
        /* can be filtered per iteration.                                   */
        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);

            /* Interleave vertically adjacent rows so maddubs sees          */
            /* (y[i], y[i+1]) byte pairs per 16-bit lane.                    */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            /* Full-pel rows for the q-pel average.                          */
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel: (hpel + fpel + 1) >> 1

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);

            /* Rotate the row registers down by two for the next iteration. */
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }

    else if(wd == 8)
    {
        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        /* Pack consecutive 8-pixel rows side by side.                       */
        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);

            /* First output row: filter via the low halves of the packed    */
            /* row pairs.                                                    */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1); //full-pel row for the average

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel

            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

            /* Second output row: same filter via the high halves (rows     */
            /* shifted down by one).                                         */
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            /* Rotate the row registers down by two for the next iteration. */
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i res_t0_8x16b;

        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            src_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            /* First output row, left 8 pixels (low-half interleaves).       */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            /* First output row, right 8 pixels (high-half interleaves).     */
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            src_r0r1_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1); //full-pel row for the average

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            /* Second output row: same two-half scheme using rows r1..r6.    */
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            src_r0r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred1 + src_strd));

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits: (sum + 16) >> 5

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            /* Rotate the row registers down by two for the next iteration. */
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a six-tap filter vertically and*/
/*                  horizontally on ht x wd block separately and averages   */
/*                  the two sets of values to calculate values at (1/4,1/4),*/
/*                  (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in    */
/*                  sec. 8.4.2.2.1 titled "Luma sample interpolation        */
/*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),    */
/*                  (16,8), (8,16) or (16,16).
*/ 2090 /* */ 2091 /* Inputs : puc_src - pointer to source */ 2092 /* puc_dst - pointer to destination */ 2093 /* src_strd - stride for source */ 2094 /* dst_strd - stride for destination */ 2095 /* ht - height of the block */ 2096 /* wd - width of the block */ 2097 /* pu1_tmp - pointer to temporary buffer */ 2098 /* dydx - x and y reference offset for q-pel */ 2099 /* calculations */ 2100 /* */ 2101 /* Issues : None */ 2102 /* */ 2103 /* Revision History: */ 2104 /* */ 2105 /* DD MM YYYY Author(s) Changes */ 2106 /* 13 02 2015 Kaushik Initial Version */ 2107 /* Senthoor */ 2108 /* */ 2109 /*****************************************************************************/ 2110 void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src, 2111 UWORD8 *pu1_dst, 2112 WORD32 src_strd, 2113 WORD32 dst_strd, 2114 WORD32 ht, 2115 WORD32 wd, 2116 UWORD8* pu1_tmp, 2117 WORD32 dydx) 2118 { 2119 WORD32 ht_temp; 2120 UWORD8 *pu1_pred_vert,*pu1_pred_horiz; 2121 UWORD8 *pu1_tmp1, *pu1_tmp2; 2122 WORD32 x_offset, y_offset; 2123 2124 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 2125 __m128i const_val16_8x16b; 2126 2127 pu1_tmp1 = pu1_tmp; 2128 2129 dydx &= 0xf; 2130 ht_temp = ht; 2131 x_offset = dydx & 0x3; 2132 y_offset = dydx >> 2; 2133 pu1_tmp2 = pu1_tmp1; 2134 2135 pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd; 2136 pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2; 2137 //the filter input starts from x[-2] (till x[3]) 2138 2139 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 2140 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 2141 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 2142 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 2143 const_val16_8x16b = _mm_set1_epi16(16); 2144 2145 if(wd == 4) 2146 { 2147 //vertical q-pel filter 2148 { 2149 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, 
src_r4_16x8b; 2150 __m128i src_r5_16x8b, src_r6_16x8b; 2151 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 2152 2153 __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 2154 2155 //epilogue: Load all the pred rows except sixth and seventh row for the 2156 //first and second row processing. 2157 src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2158 pu1_pred_vert = pu1_pred_vert + src_strd; 2159 2160 src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2161 pu1_pred_vert = pu1_pred_vert + src_strd; 2162 src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b); 2163 2164 src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2165 pu1_pred_vert = pu1_pred_vert + src_strd; 2166 src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b); 2167 2168 src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2169 pu1_pred_vert = pu1_pred_vert + src_strd; 2170 src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b); 2171 2172 src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2173 pu1_pred_vert = pu1_pred_vert + src_strd; 2174 src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b); 2175 2176 //Core Loop: Process all the rows. 
2177 do 2178 { 2179 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2180 src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b); 2181 2182 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); 2183 src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b); 2184 2185 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 2186 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 2187 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 2188 2189 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2190 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2191 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2192 2193 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2194 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2195 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2196 2197 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 2198 res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); 2199 2200 _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b); 2201 2202 src_r0_16x8b = src_r2_16x8b; 2203 src_r1_16x8b = src_r3_16x8b; 2204 src_r2_16x8b = src_r4_16x8b; 2205 src_r3_16x8b = src_r5_16x8b; 2206 src_r4_16x8b = src_r6_16x8b; 2207 2208 ht_temp -= 2; 2209 pu1_pred_vert += src_strd << 1; 2210 pu1_tmp1 += 8; 2211 } 2212 while(ht_temp > 0); 2213 } 2214 2215 //horizontal q-pel filter 2216 { 2217 __m128i src_r0_16x8b, src_r1_16x8b; 2218 __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; 2219 __m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b; 2220 2221 __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; 2222 __m128i res_r0r1_16x8b; 2223 2224 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 2225 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
2226 2227 do 2228 { 2229 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 2230 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 2231 2232 src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2); 2233 2234 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 2235 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 2236 2237 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 2238 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 2239 2240 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 2241 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 2242 //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 2243 2244 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 2245 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 2246 2247 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 2248 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 2249 //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 2250 2251 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 2252 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 2253 2254 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 2255 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 
a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 2256 //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 2257 2258 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); 2259 res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b); 2260 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15; 2261 //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15; 2262 //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15; 2263 //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15; 2264 //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15; 2265 //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15; 2266 //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15; 2267 //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15; 2268 2269 res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5); //shifting right by 5 bits. 2270 2271 res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b); 2272 2273 res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b); 2274 2275 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b); 2276 res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4); 2277 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b); 2278 2279 ht -= 2; 2280 pu1_pred_horiz += src_strd << 1; 2281 pu1_tmp2 += 8; 2282 pu1_dst += dst_strd << 1; 2283 } 2284 while(ht > 0); 2285 } 2286 } 2287 else if(wd == 8) 2288 { 2289 //vertical q-pel filter 2290 { 2291 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; 2292 __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b; 2293 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 2294 2295 __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 2296 2297 //epilogue: Load all the pred rows except sixth and seventh row for the 2298 //first and second row processing. 
2299 src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2300 pu1_pred_vert = pu1_pred_vert + src_strd; 2301 2302 src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2303 pu1_pred_vert = pu1_pred_vert + src_strd; 2304 src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); 2305 2306 src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2307 pu1_pred_vert = pu1_pred_vert + src_strd; 2308 src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b); 2309 2310 src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2311 pu1_pred_vert = pu1_pred_vert + src_strd; 2312 src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b); 2313 2314 src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2315 pu1_pred_vert = pu1_pred_vert + src_strd; 2316 src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b); 2317 2318 //Core Loop: Process all the rows. 2319 do 2320 { 2321 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); 2322 src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b); 2323 2324 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); 2325 src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b); 2326 2327 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 2328 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 2329 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 2330 2331 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2332 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2333 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2334 2335 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2336 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2337 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2338 2339 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
2340 res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); 2341 2342 _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b); 2343 2344 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); 2345 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); 2346 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); 2347 2348 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2349 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2350 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2351 2352 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2353 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2354 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2355 2356 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 2357 res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); 2358 2359 _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b); 2360 2361 src_r0_16x8b = src_r2_16x8b; 2362 src_r1_16x8b = src_r3_16x8b; 2363 src_r2_16x8b = src_r4_16x8b; 2364 src_r3_16x8b = src_r5_16x8b; 2365 src_r4_16x8b = src_r6_16x8b; 2366 2367 ht_temp -= 2; 2368 pu1_pred_vert += src_strd << 1; 2369 pu1_tmp1 += 16; 2370 } 2371 while(ht_temp > 0); 2372 } 2373 2374 //horizontal q-pel filter 2375 { 2376 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 2377 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 2378 __m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b; 2379 2380 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 2381 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b; 2382 2383 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 2384 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
2385 2386 do 2387 { 2388 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 2389 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 2390 2391 src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2)); //a2 a3 a4 a5 a6 a7 a8....a15 0 or 2392 //a3 a4 a5 a6 a7 a8 a9....a15 0 2393 src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8)); 2394 //b2 b3 b4 b5 b6 b7 b8....b15 0 or 2395 //b3 b4 b5 b6 b7 b8 b9....b15 0 2396 2397 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 2398 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 2399 2400 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 2401 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 2402 2403 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 2404 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 2405 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 2406 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 2407 2408 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 2409 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 2410 2411 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 2412 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 2413 2414 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 2415 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 2416 2417 
res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 2418 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 2419 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 2420 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 2421 2422 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 2423 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 2424 2425 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 2426 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 2427 2428 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 2429 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 2430 2431 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 2432 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 2433 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 2434 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 2435 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 2436 res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b); 2437 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 2438 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. 
2439 2440 res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b); 2441 res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b); 2442 2443 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); 2444 2445 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 2446 res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b); 2447 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 2448 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits. 2449 2450 res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b); 2451 res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b); 2452 2453 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); 2454 2455 ht -= 2; 2456 pu1_pred_horiz += src_strd << 1; 2457 pu1_dst += dst_strd << 1; 2458 pu1_tmp2 += 16; 2459 } 2460 while(ht > 0); 2461 } 2462 } 2463 else // wd == 16 2464 { 2465 //vertical q-pel filter 2466 { 2467 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; 2468 __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b; 2469 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 2470 2471 __m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 2472 __m128i res_16x8b; 2473 2474 //epilogue: Load all the pred rows except sixth and seventh row for the 2475 //first and second row processing. 2476 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2477 pu1_pred_vert = pu1_pred_vert + src_strd; 2478 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2479 pu1_pred_vert = pu1_pred_vert + src_strd; 2480 src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2481 pu1_pred_vert = pu1_pred_vert + src_strd; 2482 src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2483 pu1_pred_vert = pu1_pred_vert + src_strd; 2484 src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2485 pu1_pred_vert = pu1_pred_vert + src_strd; 2486 2487 //Core Loop: Process all the rows. 
2488 do 2489 { 2490 src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); 2491 src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd)); 2492 2493 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 2494 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 2495 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 2496 2497 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2498 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2499 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2500 2501 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2502 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2503 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); 2504 res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 2505 2506 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); 2507 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); 2508 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); 2509 2510 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2511 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2512 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2513 2514 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2515 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2516 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); 2517 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
2518 2519 res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); 2520 2521 _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b); 2522 2523 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b); 2524 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b); 2525 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); 2526 2527 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2528 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2529 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2530 2531 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2532 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2533 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); 2534 res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 2535 2536 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b); 2537 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b); 2538 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b); 2539 2540 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2541 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2542 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2543 2544 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2545 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); 2546 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); 2547 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. 
2548 2549 res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); 2550 2551 _mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b); 2552 2553 src_r0_16x8b = src_r2_16x8b; 2554 src_r1_16x8b = src_r3_16x8b; 2555 src_r2_16x8b = src_r4_16x8b; 2556 src_r3_16x8b = src_r5_16x8b; 2557 src_r4_16x8b = src_r6_16x8b; 2558 2559 ht_temp -= 2; 2560 pu1_pred_vert += src_strd << 1; 2561 pu1_tmp1 += 32; 2562 } 2563 while(ht_temp > 0); 2564 } 2565 //horizontal q-pel filter 2566 { 2567 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 2568 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 2569 __m128i src_vpel_16x8b; 2570 2571 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 2572 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 2573 __m128i res_16x8b; 2574 2575 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 2576 //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 2577 //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 2578 2579 do 2580 { 2581 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 2582 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 2583 src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2)); 2584 2585 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 2586 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 2587 2588 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 2589 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 2590 2591 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 2592 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 2593 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 
b3*c0+b4*c1 2594 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 2595 2596 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 2597 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 2598 2599 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 2600 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 2601 2602 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 2603 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 2604 2605 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 2606 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 2607 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 2608 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 2609 2610 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 2611 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 2612 2613 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 2614 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 2615 2616 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 2617 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 2618 2619 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 2620 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 2621 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 
b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 2622 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 2623 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 2624 res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b); 2625 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 2626 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. 2627 2628 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 2629 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 2630 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, const_val16_8x16b); 2631 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); //shifting right by 5 bits. 2632 2633 res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b); 2634 2635 res_16x8b = _mm_avg_epu8(res_16x8b, src_vpel_16x8b); 2636 _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b); 2637 2638 ht --; 2639 pu1_pred_horiz += src_strd; 2640 pu1_dst += dst_strd; 2641 pu1_tmp2 += 16; 2642 } 2643 while(ht > 0); 2644 } 2645 } 2646 } 2647 2648 /*****************************************************************************/ 2649 /* */ 2650 /* Function Name : ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3 */ 2651 /* */ 2652 /* Description : This function implements a six-tap filter vertically and */ 2653 /* horizontally on ht x wd block separately and averages */ 2654 /* the two sets of values to calculate values at (1/4,1/2), */ 2655 /* or (3/4, 1/2) as mentioned in sec. 8.4.2.2.1 titled */ 2656 /* "Luma sample interpolation process". (ht,wd) can be */ 2657 /* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). 
*/ 2658 /* */ 2659 /* Inputs : puc_src - pointer to source */ 2660 /* puc_dst - pointer to destination */ 2661 /* src_strd - stride for source */ 2662 /* dst_strd - stride for destination */ 2663 /* ht - height of the block */ 2664 /* wd - width of the block */ 2665 /* pu1_tmp - pointer to temporary buffer */ 2666 /* dydx - x and y reference offset for q-pel */ 2667 /* calculations */ 2668 /* */ 2669 /* Issues : None */ 2670 /* */ 2671 /* Revision History: */ 2672 /* */ 2673 /* DD MM YYYY Author(s) Changes */ 2674 /* 13 02 2015 Kaushik Initial Version */ 2675 /* Senthoor */ 2676 /* */ 2677 /*****************************************************************************/ 2678 void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src, 2679 UWORD8 *pu1_dst, 2680 WORD32 src_strd, 2681 WORD32 dst_strd, 2682 WORD32 ht, 2683 WORD32 wd, 2684 UWORD8* pu1_tmp, 2685 WORD32 dydx) 2686 { 2687 WORD32 ht_temp; 2688 WORD32 x_offset; 2689 WORD32 off0,off1, off2, off3, off4, off5; 2690 WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3; 2691 2692 ht_temp = ht; 2693 x_offset = dydx & 0x3; 2694 pi2_temp1 = (WORD16 *)pu1_tmp; 2695 pi2_temp2 = pi2_temp1; 2696 pi2_temp3 = pi2_temp1 + (x_offset >> 1); 2697 2698 pu1_src -= 2 * src_strd; 2699 pu1_src -= 2; 2700 pi2_temp3 += 2; 2701 //the filter input starts from x[-2] (till x[3]) 2702 2703 if(wd == 4) 2704 { 2705 //vertical half-pel 2706 { 2707 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; 2708 __m128i src_r5_16x8b, src_r6_16x8b; 2709 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 2710 2711 __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 2712 2713 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 2714 2715 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 2716 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 2717 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 
c5 2718 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 2719 off0 = -((src_strd << 2) + src_strd) + 8; 2720 off1 = -(src_strd << 2) + 8; 2721 off2 = -((src_strd << 1) + src_strd) + 8; 2722 off3 = -(src_strd << 1) + 8; 2723 off4 = -src_strd + 8; 2724 off5 = 8; 2725 2726 //epilogue: Load all the pred rows except sixth and seventh row for the 2727 //first and second row processing. 2728 src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2729 pu1_src = pu1_src + src_strd; 2730 2731 src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2732 pu1_src = pu1_src + src_strd; 2733 2734 src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2735 pu1_src = pu1_src + src_strd; 2736 2737 src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2738 pu1_src = pu1_src + src_strd; 2739 2740 src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2741 pu1_src = pu1_src + src_strd; 2742 2743 //Core Loop: Process all the rows. 2744 do 2745 { 2746 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2747 2748 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 2749 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 2750 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 2751 2752 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2753 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2754 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2755 2756 res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b); 2757 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2758 2759 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b); 2760 2761 pi2_temp1[8] = pu1_src[off0] + pu1_src[off5] 2762 - (pu1_src[off1] + pu1_src[off4]) 2763 + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2) 2764 + ((pu1_src[off2] + pu1_src[off3]) << 4); 2765 2766 pu1_src = pu1_src + src_strd; 2767 pi2_temp1 = pi2_temp1 + 9; 2768 2769 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2770 2771 src_r0r1_16x8b = 
_mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b); 2772 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b); 2773 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); 2774 2775 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2776 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2777 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2778 2779 res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b); 2780 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2781 2782 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b); 2783 2784 pi2_temp1[8] = pu1_src[off0] + pu1_src[off5] 2785 - (pu1_src[off1] + pu1_src[off4]) 2786 + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2) 2787 + ((pu1_src[off2] + pu1_src[off3]) << 4); 2788 2789 ht_temp -= 2; 2790 pu1_src = pu1_src + src_strd; 2791 pi2_temp1 = pi2_temp1 + 9; 2792 2793 src_r0_16x8b = src_r2_16x8b; 2794 src_r1_16x8b = src_r3_16x8b; 2795 src_r2_16x8b = src_r4_16x8b; 2796 src_r3_16x8b = src_r5_16x8b; 2797 src_r4_16x8b = src_r6_16x8b; 2798 } 2799 while(ht_temp > 0); 2800 } 2801 2802 //horizontal q-pel 2803 { 2804 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b; 2805 __m128i src_r3_8x16b, src_r4_8x16b, src_r5_8x16b; 2806 __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b; 2807 __m128i src_hpel_16x8b, src_hpel_8x16b; 2808 2809 __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 2810 __m128i res_8x16b, res_16x8b; 2811 2812 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 2813 __m128i const_val512_4x32b, const_val16_8x16b; 2814 2815 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 2816 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 2817 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 2818 2819 const_val512_4x32b = _mm_set1_epi32(512); 2820 const_val16_8x16b = _mm_set1_epi16(16); 2821 2822 do 2823 { 2824 src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2)); 2825 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1)); 
2826 src_r2_8x16b = _mm_srli_si128(src_r1_8x16b, 2); 2827 src_r3_8x16b = _mm_srli_si128(src_r1_8x16b, 4); 2828 src_r4_8x16b = _mm_srli_si128(src_r1_8x16b, 6); 2829 src_r5_8x16b = _mm_srli_si128(src_r1_8x16b, 8); 2830 2831 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 2832 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 2833 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 2834 2835 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); 2836 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); 2837 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); 2838 2839 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 2840 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b); 2841 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 2842 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 2843 2844 res_8x16b = _mm_packs_epi32(res_t1_4x32b, res_t1_4x32b); 2845 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 2846 2847 src_hpel_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp3)); 2848 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 2849 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 
2850 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 2851 2852 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 2853 2854 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b); 2855 2856 ht--; 2857 pi2_temp2 = pi2_temp2 + 4 + 5; 2858 pi2_temp3 = pi2_temp3 + 4 + 5; 2859 pu1_dst = pu1_dst + dst_strd; 2860 } 2861 while(ht > 0); 2862 } 2863 } 2864 else if(wd == 8) 2865 { 2866 // vertical half-pel 2867 { 2868 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; 2869 __m128i src_r5_16x8b, src_r6_16x8b; 2870 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 2871 2872 __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 2873 2874 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 2875 2876 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 2877 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 2878 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 2879 2880 //epilogue: Load all the pred rows except sixth and seventh row for the 2881 //first and second row processing. 2882 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2883 pu1_src = pu1_src + src_strd; 2884 2885 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2886 pu1_src = pu1_src + src_strd; 2887 2888 src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2889 pu1_src = pu1_src + src_strd; 2890 2891 src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2892 pu1_src = pu1_src + src_strd; 2893 2894 src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2895 pu1_src = pu1_src + src_strd; 2896 2897 //Core Loop: Process all the rows. 
2898 do 2899 { 2900 src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 2901 src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); 2902 2903 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 2904 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 2905 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 2906 2907 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2908 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2909 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2910 2911 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2912 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2913 2914 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b); 2915 2916 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); 2917 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); 2918 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); 2919 2920 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2921 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2922 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2923 2924 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2925 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2926 2927 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b); 2928 2929 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b); 2930 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b); 2931 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); 2932 2933 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2934 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2935 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2936 2937 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2938 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2939 2940 _mm_storeu_si128((__m128i 
*)(pi2_temp1 + 8 + 5), res_t1_8x16b); 2941 2942 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b); 2943 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b); 2944 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b); 2945 2946 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 2947 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 2948 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 2949 2950 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 2951 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 2952 2953 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5 + 8), res_t1_8x16b); 2954 2955 src_r0_16x8b = src_r2_16x8b; 2956 src_r1_16x8b = src_r3_16x8b; 2957 src_r2_16x8b = src_r4_16x8b; 2958 src_r3_16x8b = src_r5_16x8b; 2959 src_r4_16x8b = src_r6_16x8b; 2960 2961 ht_temp -= 2; 2962 pu1_src = pu1_src + (src_strd << 1); 2963 pi2_temp1 = pi2_temp1 + (13 << 1); 2964 } 2965 while(ht_temp > 0); 2966 } 2967 // horizontal q-pel 2968 { 2969 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; 2970 __m128i src_r4_8x16b, src_r5_8x16b; 2971 __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b; 2972 __m128i src_r0r1_c1_8x16b, src_r2r3_c1_8x16b, src_r4r5_c1_8x16b; 2973 __m128i src_hpel_8x16b, src_hpel_16x8b; 2974 2975 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 2976 __m128i res_8x16b, res_16x8b; 2977 2978 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 2979 __m128i const_val512_4x32b, const_val16_8x16b; 2980 2981 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 2982 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 2983 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 2984 2985 const_val512_4x32b = _mm_set1_epi32(512); 2986 const_val16_8x16b = _mm_set1_epi16(16); 2987 2988 do 2989 { 2990 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 2991 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1)); 2992 src_r2_8x16b = _mm_loadu_si128((__m128i 
*)(pi2_temp2 + 2)); 2993 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3)); 2994 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4)); 2995 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5)); 2996 2997 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 2998 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 2999 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3000 3001 src_r0r1_c1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3002 src_r2r3_c1_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 3003 src_r4r5_c1_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3004 3005 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); 3006 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); 3007 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); 3008 3009 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3010 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3011 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3012 3013 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3014 3015 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c1_8x16b, coeff0_1_8x16b); 3016 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c1_8x16b, coeff2_3_8x16b); 3017 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c1_8x16b, coeff4_5_8x16b); 3018 3019 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3020 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3021 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3022 3023 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3024 3025 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3026 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3027 3028 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3)); 3029 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3030 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 
3031 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3032 3033 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3034 3035 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); 3036 3037 ht--; 3038 pi2_temp2 = pi2_temp2 + 8 + 5; 3039 pi2_temp3 = pi2_temp3 + 8 + 5; 3040 pu1_dst = pu1_dst + dst_strd; 3041 } 3042 while(ht > 0); 3043 } 3044 } 3045 else // wd == 16 3046 { 3047 // vertical half-pel 3048 { 3049 __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; 3050 __m128i src_r4_16x8b, src_r5_16x8b; 3051 __m128i src_r0_c2_16x8b, src_r1_c2_16x8b, src_r2_c2_16x8b, src_r3_c2_16x8b; 3052 __m128i src_r4_c2_16x8b, src_r5_c2_16x8b; 3053 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; 3054 3055 __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; 3056 3057 __m128i coeff0_1_16x8b,coeff2_3_16x8b,coeff4_5_16x8b; 3058 3059 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 3060 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 3061 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 3062 3063 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3064 src_r0_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3065 pu1_src = pu1_src + src_strd; 3066 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3067 src_r1_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3068 pu1_src = pu1_src + src_strd; 3069 src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3070 src_r2_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3071 pu1_src = pu1_src + src_strd; 3072 src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3073 src_r3_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3074 pu1_src = pu1_src + src_strd; 3075 src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3076 src_r4_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3077 pu1_src = pu1_src + src_strd; 3078 3079 //Core Loop: Process all the 
rows. 3080 do 3081 { 3082 src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); 3083 src_r5_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 3084 3085 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); 3086 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); 3087 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); 3088 3089 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 3090 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 3091 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 3092 3093 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 3094 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 3095 3096 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b); 3097 3098 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); 3099 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); 3100 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); 3101 3102 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 3103 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 3104 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 3105 3106 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 3107 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 3108 3109 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b); 3110 3111 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_c2_16x8b, src_r1_c2_16x8b); 3112 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_c2_16x8b, src_r3_c2_16x8b); 3113 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_c2_16x8b, src_r5_c2_16x8b); 3114 3115 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); 3116 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); 3117 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); 3118 3119 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); 3120 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); 3121 3122 
_mm_storeu_si128((__m128i *)(pi2_temp1 + 16), res_t1_8x16b); 3123 3124 src_r0_16x8b = src_r1_16x8b; 3125 src_r1_16x8b = src_r2_16x8b; 3126 src_r2_16x8b = src_r3_16x8b; 3127 src_r3_16x8b = src_r4_16x8b; 3128 src_r4_16x8b = src_r5_16x8b; 3129 3130 src_r0_c2_16x8b = src_r1_c2_16x8b; 3131 src_r1_c2_16x8b = src_r2_c2_16x8b; 3132 src_r2_c2_16x8b = src_r3_c2_16x8b; 3133 src_r3_c2_16x8b = src_r4_c2_16x8b; 3134 src_r4_c2_16x8b = src_r5_c2_16x8b; 3135 3136 ht_temp--; 3137 pu1_src = pu1_src + src_strd; 3138 pi2_temp1 = pi2_temp1 + 16 + 5; 3139 } 3140 while(ht_temp > 0); 3141 } 3142 // horizontal q-pel 3143 { 3144 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; 3145 __m128i src_r4_8x16b, src_r5_8x16b; 3146 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 3147 __m128i src_hpel1_8x16b, src_hpel2_8x16b, src_hpel_16x8b; 3148 3149 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 3150 __m128i res_c0_8x16b, res_c1_8x16b, res_16x8b; 3151 3152 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 3153 __m128i const_val512_4x32b, const_val16_8x16b; 3154 3155 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 3156 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 3157 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 3158 3159 const_val512_4x32b = _mm_set1_epi32(512); 3160 const_val16_8x16b = _mm_set1_epi16(16); 3161 3162 do 3163 { 3164 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 3165 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1)); 3166 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2)); 3167 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3)); 3168 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4)); 3169 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5)); 3170 3171 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3172 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3173 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3174 3175 res_t1_4x32b = 
_mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3176 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3177 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3178 3179 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3180 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3181 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3182 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3183 3184 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3185 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 3186 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3187 3188 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3189 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3190 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3191 3192 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3193 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b); 3194 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3195 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3196 3197 res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3198 3199 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8)); 3200 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 1)); 3201 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 2)); 3202 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 3)); 3203 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 4)); 3204 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 5)); 3205 3206 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3207 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3208 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3209 3210 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3211 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3212 res_t3_4x32b = 
_mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3213 3214 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3215 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b); 3216 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3217 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10); 3218 3219 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3220 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 3221 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3222 3223 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3224 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3225 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3226 3227 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3228 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b); 3229 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3230 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3231 3232 res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3233 res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b); 3234 3235 src_hpel1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3)); 3236 src_hpel1_8x16b = _mm_add_epi16(src_hpel1_8x16b, const_val16_8x16b); 3237 src_hpel1_8x16b = _mm_srai_epi16(src_hpel1_8x16b, 5); //shifting right by 5 bits. 3238 3239 src_hpel2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8)); 3240 src_hpel2_8x16b = _mm_add_epi16(src_hpel2_8x16b, const_val16_8x16b); 3241 src_hpel2_8x16b = _mm_srai_epi16(src_hpel2_8x16b, 5); //shifting right by 5 bits. 
3242 3243 src_hpel_16x8b = _mm_packus_epi16(src_hpel1_8x16b, src_hpel2_8x16b); 3244 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3245 3246 _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b); 3247 3248 ht--; 3249 pi2_temp2 = pi2_temp2 + 16 + 5; 3250 pi2_temp3 = pi2_temp3 + 16 + 5; 3251 pu1_dst = pu1_dst + dst_strd; 3252 } 3253 while(ht > 0); 3254 } 3255 } 3256 } 3257 3258 /*****************************************************************************/ 3259 /* */ 3260 /* Function Name : ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3 */ 3261 /* */ 3262 /* Description : This function implements a six-tap filter vertically and */ 3263 /* horizontally on ht x wd block separately and averages */ 3264 /* the two sets of values to calculate values at (1/2,1/4), */ 3265 /* or (1/2, 3/4) as mentioned in sec. 8.4.2.2.1 titled */ 3266 /* "Luma sample interpolation process". (ht,wd) can be */ 3267 /* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ 3268 /* */ 3269 /* Inputs : puc_src - pointer to source */ 3270 /* puc_dst - pointer to destination */ 3271 /* src_strd - stride for source */ 3272 /* dst_strd - stride for destination */ 3273 /* ht - height of the block */ 3274 /* wd - width of the block */ 3275 /* pu1_tmp - pointer to temporary buffer */ 3276 /* dydx - x and y reference offset for q-pel */ 3277 /* calculations */ 3278 /* */ 3279 /* Issues : None */ 3280 /* */ 3281 /* Revision History: */ 3282 /* */ 3283 /* DD MM YYYY Author(s) Changes */ 3284 /* 13 02 2015 Kaushik Initial Version */ 3285 /* Senthoor */ 3286 /* */ 3287 /*****************************************************************************/ 3288 void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src, 3289 UWORD8 *pu1_dst, 3290 WORD32 src_strd, 3291 WORD32 dst_strd, 3292 WORD32 ht, 3293 WORD32 wd, 3294 UWORD8* pu1_tmp, 3295 WORD32 dydx) 3296 { 3297 WORD32 ht_temp; 3298 WORD32 y_offset; 3299 WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3; 3300 3301 y_offset = (dydx & 0xf) >> 2; 3302 
pi2_temp1 = (WORD16 *)pu1_tmp; 3303 pi2_temp2 = pi2_temp1; 3304 pi2_temp3 = pi2_temp1 + (y_offset >> 1) * wd; 3305 3306 ht_temp = ht + 5; 3307 pu1_src -= src_strd << 1; 3308 pu1_src -= 2; 3309 pi2_temp3 += wd << 1; 3310 //the filter input starts from x[-2] (till x[3]) 3311 3312 if(wd == 4) 3313 { 3314 // horizontal half-pel 3315 { 3316 __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_t1_16x8b; 3317 __m128i src_r0_sht_16x8b, src_r1_sht_16x8b; 3318 __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b; 3319 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 3320 3321 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 3322 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 3323 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 3324 3325 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 3326 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 3327 3328 do 3329 { 3330 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 3331 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 3332 3333 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 3334 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 3335 3336 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 3337 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 3338 3339 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 3340 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 3341 //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 3342 3343 src_r0_16x8b = 
_mm_srli_si128(src_r0_16x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 3344 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 3345 3346 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 3347 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 3348 //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 3349 3350 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 3351 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4); //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 3352 3353 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 3354 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 3355 //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 3356 3357 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b); 3358 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b); 3359 3360 3361 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0r1_t1_8x16b); 3362 3363 ht_temp -= 2; 3364 pu1_src = pu1_src + (src_strd << 1); 3365 pi2_temp1 = pi2_temp1 + (4 << 1); 3366 } 3367 while(ht_temp > 0); 3368 } 3369 // vertical q-pel 3370 { 3371 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; 3372 __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b; 3373 __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b; 3374 __m128i src_hpel_16x8b, src_hpel_8x16b; 3375 3376 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 3377 __m128i res_8x16b, res_16x8b; 3378 3379 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 3380 __m128i const_val512_4x32b, const_val16_8x16b; 3381 3382 const_val512_4x32b = _mm_set1_epi32(512); 3383 const_val16_8x16b = _mm_set1_epi16(16); 3384 3385 coeff0_1_8x16b = 
_mm_set1_epi32(0xFFFB0001); 3386 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 3387 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 3388 3389 src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2)); 3390 src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4)); 3391 src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 8)); 3392 src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 12)); 3393 src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 16)); 3394 pi2_temp2 += 20; 3395 3396 do 3397 { 3398 src_r5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2)); 3399 src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4)); 3400 3401 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3402 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3403 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3404 3405 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); 3406 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); 3407 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); 3408 3409 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3410 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3411 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3412 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3413 3414 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 3415 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 3416 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 3417 3418 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b); 3419 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b); 3420 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b); 3421 3422 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3423 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3424 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3425 res_t1_4x32b = 
_mm_srai_epi32(res_t1_4x32b, 10); 3426 3427 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3428 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3429 3430 src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3); 3431 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3432 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 3433 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3434 3435 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3436 3437 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b); 3438 res_16x8b = _mm_srli_si128(res_16x8b, 4); 3439 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b); 3440 3441 src_r0_8x16b = src_r2_8x16b; 3442 src_r1_8x16b = src_r3_8x16b; 3443 src_r2_8x16b = src_r4_8x16b; 3444 src_r3_8x16b = src_r5_8x16b; 3445 src_r4_8x16b = src_r6_8x16b; 3446 3447 ht -= 2; 3448 pi2_temp2 = pi2_temp2 + (4 << 1); 3449 pi2_temp3 = pi2_temp3 + (4 << 1); 3450 pu1_dst = pu1_dst + (dst_strd << 1); 3451 } 3452 while(ht > 0); 3453 } 3454 } 3455 else if(wd == 8) 3456 { 3457 // horizontal half-pel 3458 { 3459 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 3460 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 3461 3462 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 3463 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 3464 3465 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 3466 3467 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 3468 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 3469 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 3470 3471 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 3472 //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 
3473 3474 do 3475 { 3476 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 3477 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 3478 3479 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 3480 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 3481 3482 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 3483 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 3484 3485 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 3486 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 3487 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 3488 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 3489 3490 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 3491 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 3492 3493 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 3494 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 3495 3496 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 3497 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 3498 3499 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 3500 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 3501 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 3502 
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 3503 3504 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 3505 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 3506 3507 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 3508 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 3509 3510 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 3511 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 3512 3513 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 3514 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 3515 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 3516 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 3517 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 3518 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 3519 3520 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 3521 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 3522 3523 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b); 3524 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b); 3525 3526 ht_temp -= 2; 3527 pu1_src = pu1_src + (src_strd << 1); 3528 pi2_temp1 = pi2_temp1 + (8 << 1); 3529 } 3530 while(ht_temp > 0); 3531 } 3532 // vertical q-pel 3533 { 3534 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; 3535 __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b; 3536 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 3537 __m128i src_hpel_8x16b, src_hpel_16x8b; 3538 3539 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 3540 __m128i res_8x16b, 
res_16x8b; 3541 3542 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 3543 __m128i const_val512_4x32b, const_val16_8x16b; 3544 3545 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 3546 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 3547 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 3548 3549 const_val512_4x32b = _mm_set1_epi32(512); 3550 const_val16_8x16b = _mm_set1_epi16(16); 3551 3552 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 3553 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8)); 3554 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); 3555 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 24)); 3556 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); 3557 pi2_temp2 += 40; 3558 3559 do 3560 { 3561 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 3562 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8)); 3563 3564 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3565 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3566 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3567 3568 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3569 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3570 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3571 3572 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3573 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3574 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3575 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3576 3577 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3578 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 3579 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3580 3581 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3582 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3583 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3584 3585 
res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3586 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3587 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3588 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3589 3590 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3591 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3592 3593 src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3); 3594 src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b); 3595 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 3596 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3597 3598 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3599 3600 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); 3601 3602 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 3603 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 3604 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 3605 3606 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3607 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3608 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3609 3610 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3611 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3612 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3613 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3614 3615 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 3616 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 3617 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 3618 3619 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3620 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3621 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3622 3623 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3624 res_t3_4x32b = 
_mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3625 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3626 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3627 3628 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3629 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3630 3631 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8)); 3632 src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b); 3633 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 3634 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3635 3636 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3637 3638 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); 3639 3640 src_r0_8x16b = src_r2_8x16b; 3641 src_r1_8x16b = src_r3_8x16b; 3642 src_r2_8x16b = src_r4_8x16b; 3643 src_r3_8x16b = src_r5_8x16b; 3644 src_r4_8x16b = src_r6_8x16b; 3645 3646 ht -= 2; 3647 pi2_temp2 = pi2_temp2 + (8 << 1); 3648 pi2_temp3 = pi2_temp3 + (8 << 1); 3649 pu1_dst = pu1_dst + (dst_strd << 1); 3650 } 3651 while(ht > 0); 3652 } 3653 } 3654 else // wd == 16 3655 { 3656 UWORD8 *pu1_dst1; 3657 WORD16 *pi2_temp4,*pi2_temp5; 3658 3659 pu1_dst1 = pu1_dst + 8; 3660 pi2_temp4 = pi2_temp2 + 8; 3661 pi2_temp5 = pi2_temp3 + 8; 3662 3663 // horizontal half-pel 3664 { 3665 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; 3666 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; 3667 3668 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; 3669 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; 3670 3671 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; 3672 3673 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 3674 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 3675 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 3676 3677 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... 
3678 //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... 3679 //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. 3680 3681 do 3682 { 3683 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 3684 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 3685 3686 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 3687 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 3688 3689 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 3690 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 3691 3692 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 3693 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 3694 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 3695 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 3696 3697 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 3698 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 3699 3700 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 3701 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 3702 3703 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 3704 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 3705 3706 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 3707 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 3708 
res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 3709 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 3710 3711 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 3712 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 3713 3714 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 3715 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 3716 3717 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 3718 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 3719 3720 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 3721 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 3722 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 3723 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 3724 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); 3725 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); 3726 3727 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); 3728 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); 3729 3730 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b); 3731 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b); 3732 3733 ht_temp--; 3734 pu1_src = pu1_src + src_strd; 3735 pi2_temp1 = pi2_temp1 + 16; 3736 } 3737 while(ht_temp > 0); 3738 } 3739 // vertical q-pel 3740 { 3741 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b; 3742 __m128i src_r5_8x16b, src_r6_8x16b; 3743 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; 3744 __m128i src_hpel_8x16b, 
src_hpel_16x8b; 3745 3746 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; 3747 __m128i res_8x16b, res_16x8b; 3748 3749 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; 3750 __m128i const_val512_4x32b, const_val16_8x16b; 3751 3752 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); 3753 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); 3754 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); 3755 3756 const_val512_4x32b = _mm_set1_epi32(512); 3757 const_val16_8x16b = _mm_set1_epi16(16); 3758 3759 /**********************************************************/ 3760 /* Do first height x 8 block */ 3761 /**********************************************************/ 3762 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 3763 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); 3764 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32)); 3765 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48)); 3766 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64)); 3767 pi2_temp2 += 80; 3768 3769 ht_temp = ht; 3770 do 3771 { 3772 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2)); 3773 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16)); 3774 3775 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3776 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3777 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3778 3779 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3780 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3781 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3782 3783 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3784 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3785 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3786 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3787 3788 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3789 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, 
src_r3_8x16b); 3790 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3791 3792 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3793 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3794 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3795 3796 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3797 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3798 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3799 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3800 3801 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3802 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3803 3804 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3)); 3805 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3806 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 3807 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3808 3809 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3810 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b); 3811 3812 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 3813 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 3814 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 3815 3816 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3817 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3818 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3819 3820 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3821 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3822 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3823 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3824 3825 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 3826 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 3827 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 3828 3829 
res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3830 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3831 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3832 3833 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3834 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3835 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3836 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3837 3838 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3839 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3840 3841 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 16)); 3842 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3843 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 3844 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3845 3846 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3847 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b); 3848 3849 src_r0_8x16b = src_r2_8x16b; 3850 src_r1_8x16b = src_r3_8x16b; 3851 src_r2_8x16b = src_r4_8x16b; 3852 src_r3_8x16b = src_r5_8x16b; 3853 src_r4_8x16b = src_r6_8x16b; 3854 3855 ht_temp -= 2; 3856 pi2_temp3 = pi2_temp3 + (16 << 1); 3857 pi2_temp2 = pi2_temp2 + (16 << 1); 3858 pu1_dst = pu1_dst + (dst_strd << 1); 3859 } 3860 while(ht_temp > 0); 3861 3862 /**********************************************************/ 3863 /* Do second height * 8 block */ 3864 /**********************************************************/ 3865 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4)); 3866 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16)); 3867 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 32)); 3868 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 48)); 3869 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 64)); 3870 pi2_temp4 += 80; 3871 3872 do 3873 { 3874 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4)); 3875 src_r6_8x16b = 
_mm_loadu_si128((__m128i *)(pi2_temp4 + 16)); 3876 3877 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); 3878 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); 3879 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); 3880 3881 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3882 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3883 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3884 3885 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3886 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3887 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3888 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3889 3890 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); 3891 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); 3892 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); 3893 3894 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3895 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3896 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3897 3898 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3899 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3900 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3901 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3902 3903 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3904 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3905 3906 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5)); 3907 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3908 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 
3909 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b); 3910 3911 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b); 3912 _mm_storel_epi64((__m128i *)(pu1_dst1), res_16x8b); 3913 3914 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b); 3915 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b); 3916 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b); 3917 3918 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3919 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3920 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3921 3922 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3923 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3924 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3925 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3926 3927 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b); 3928 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b); 3929 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b); 3930 3931 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); 3932 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); 3933 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); 3934 3935 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); 3936 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); 3937 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); 3938 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); 3939 3940 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); 3941 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b); 3942 3943 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5 + 16)); 3944 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b); 3945 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits. 
                src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);

                /* Average the vertical q-pel result with the rounded horizontal
                   half-pel samples and store the second row of this iteration. */
                res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst1 + dst_strd), res_16x8b);

                /* Rotate the row registers down by two: rows 2..6 become
                   rows 0..4 for the next pair of output rows. */
                src_r0_8x16b = src_r2_8x16b;
                src_r1_8x16b = src_r3_8x16b;
                src_r2_8x16b = src_r4_8x16b;
                src_r3_8x16b = src_r5_8x16b;
                src_r4_8x16b = src_r6_8x16b;

                ht -= 2;
                pi2_temp5 = pi2_temp5 + (16 << 1);
                pi2_temp4 = pi2_temp4 + (16 << 1);
                pu1_dst1 = pu1_dst1 + (dst_strd << 1);
            }
            while(ht > 0);
        }
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_chroma_ssse3                            */
/*                                                                           */
/*  Description   : This function implements a four-tap 2D filter as         */
/*                  mentioned in sec. 8.4.2.2.2 titled "Chroma sample        */
/*                  interpolation process". (ht,wd) can be (2,2), (4,2),     */
/*                  (2,4), (4,4), (8,4), (4,8) or (8,8).                     */
/*                                                                           */
/*  Inputs        : puc_src  - pointer to source                             */
/*                  puc_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  dx       - x position of destination value               */
/*                  dy       - y position of destination value               */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 dx,
                                   WORD32 dy,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 i, j, A, B, C, D;

    /* Bilinear weights per sec. 8.4.2.2.2:
       out = (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6,
       where (dx,dy) is the 1/8-pel fractional offset. A+B+C+D == 64. */
    i = 8 - dx;
    j = 8 - dy;

    A = i * j;
    B = dx * j;
    C = i * dy;
    D = dx * dy;

    /* The indexing below ([0],[2],[4] for one component; [1],[3],[5] for the
       other) shows the source holds interleaved Cb/Cr pairs, so each output
       "pixel" is two bytes wide and wd counts chroma sample pairs. */
    if(wd == 2)
    {
        WORD32 tmp1, tmp2, tmp3, tmp4;

        /* Scalar path: 2 output rows of 2 U/V pairs per iteration
           (ht is even for all supported block sizes). */
        do
        {
            //U
            tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
            tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
            //V
            tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
            tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];

            /* round by 32 and normalize the 6-bit weight sum */
            tmp1 = (tmp1 + 32) >> 6;
            tmp2 = (tmp2 + 32) >> 6;
            tmp3 = (tmp3 + 32) >> 6;
            tmp4 = (tmp4 + 32) >> 6;

            pu1_dst[0] = CLIP_U8(tmp1);
            pu1_dst[2] = CLIP_U8(tmp2);
            pu1_dst[1] = CLIP_U8(tmp3);
            pu1_dst[3] = CLIP_U8(tmp4);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            /* second row of the iteration (same filter, next source row) */
            tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
            tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
            tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
            tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];

            tmp1 = (tmp1 + 32) >> 6;
            tmp2 = (tmp2 + 32) >> 6;
            tmp3 = (tmp3 + 32) >> 6;
            tmp4 = (tmp4 + 32) >> 6;

            pu1_dst[0] = CLIP_U8(tmp1);
            pu1_dst[2] = CLIP_U8(tmp2);
            pu1_dst[1] = CLIP_U8(tmp3);
            pu1_dst[3] = CLIP_U8(tmp4);

            ht -= 2;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);

    }
    else if(wd == 4)
    {
        WORD32 AB, CD;

        __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
        __m128i res1_AB_8x16b, res1_CD_8x16b, res1_8x16b, res1_16x8b;
        __m128i res2_AB_8x16b, res2_CD_8x16b, res2_8x16b, res2_16x8b;

        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
        __m128i const_shuff_16x8b;

        /* Pack (A,B) and (C,D) as byte pairs so one pmaddubsw computes
           A*p + B*q per output sample. A..D <= 64, so they are safe as the
           signed-byte operand of _mm_maddubs_epi16. */
        AB = (B << 8) + A;
        CD = (D << 8) + C;

        coeffAB_16x8b = _mm_set1_epi16(AB);
        coeffCD_16x8b = _mm_set1_epi16(CD);

        round_add32_8x16b = _mm_set1_epi16(32);

        /* Shuffle mask pairs each byte with the byte two positions ahead
           (i.e. the horizontally adjacent sample of the SAME chroma
           component): 0,2, 1,3, 2,4, 3,5, ... */
        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);

        /* Prime the pipeline with the first source row. */
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src_r1_16x8b = _mm_shuffle_epi8(src_r1_16x8b, const_shuff_16x8b);
        pu1_src += src_strd;

        /* 2 output rows per iteration; row r3 is reused as r1 next time. */
        do
        {
            src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            src_r2_16x8b = _mm_shuffle_epi8(src_r2_16x8b, const_shuff_16x8b);
            src_r3_16x8b = _mm_shuffle_epi8(src_r3_16x8b, const_shuff_16x8b);

            /* top row weighted by (A,B), bottom row by (C,D) */
            res1_AB_8x16b = _mm_maddubs_epi16(src_r1_16x8b, coeffAB_16x8b);
            res1_CD_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffCD_16x8b);
            res2_AB_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffAB_16x8b);
            res2_CD_8x16b = _mm_maddubs_epi16(src_r3_16x8b, coeffCD_16x8b);

            res1_8x16b = _mm_add_epi16(res1_AB_8x16b, res1_CD_8x16b);
            res2_8x16b = _mm_add_epi16(res2_AB_8x16b, res2_CD_8x16b);
            res1_8x16b = _mm_add_epi16(res1_8x16b, round_add32_8x16b);
            res2_8x16b = _mm_add_epi16(res2_8x16b, round_add32_8x16b);

            /* (sum + 32) >> 6, then saturate to u8 */
            res1_8x16b = _mm_srai_epi16(res1_8x16b, 6);
            res2_8x16b = _mm_srai_epi16(res2_8x16b, 6);

            res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
            res2_16x8b = _mm_packus_epi16(res2_8x16b, res2_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

            src_r1_16x8b = src_r3_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        WORD32 AB, CD;

        __m128i src_r1l_16x8b, src_r2l_16x8b;
        __m128i src_r1h_16x8b, src_r2h_16x8b;

        __m128i res_l_AB_8x16b, res_l_CD_8x16b;
        __m128i res_h_AB_8x16b, res_h_CD_8x16b;
        __m128i res_l_8x16b, res_h_8x16b, res_16x8b;

        __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
        __m128i const_shuff_16x8b;

        /* Same coefficient packing as the wd == 4 path. */
        AB = (B << 8) + A;
        CD = (D << 8) + C;

        coeffAB_16x8b = _mm_set1_epi16(AB);
        coeffCD_16x8b = _mm_set1_epi16(CD);

        round_add32_8x16b = _mm_set1_epi16(32);

        const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);

        /* Each row is 16 output bytes, processed as a low (l) and a high (h)
           8-byte half.  Prime with the first source row. */
        src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

        src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
        src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

        pu1_src += src_strd;

        /* 4 output rows per iteration (ht is a multiple of 4 here), with the
           r1/r2 register pairs ping-ponged so each loaded row is reused as
           the top row of the next output row. */
        do
        {
            //row 1
            src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
            src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 2
            src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
            src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 3
            src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
            src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            pu1_src += src_strd;
            pu1_dst += dst_strd;

            //row 4 (was mislabeled "row 1")
            src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));

            src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
            src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);

            res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
            res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
            res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
            res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);

            res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
            res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
            res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);

            res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
            res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);

            res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht -= 4;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}