1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 21 /** 22 ******************************************************************************* 23 * @file 24 * impeg2_inter_pred_sse42_intr.c 25 * 26 * @brief 27 * Contains Motion compensation function definitions for MPEG2 decoder 28 * 29 * @author 30 * Mohit [100664] 31 * 32 * - impeg2_copy_mb_sse42() 33 * - impeg2_interpolate_sse42() 34 * - impeg2_mc_halfx_halfy_8x8_sse42() 35 * - impeg2_mc_halfx_fully_8x8_sse42() 36 * - impeg2_mc_fullx_halfy_8x8_sse42() 37 * - impeg2_mc_fullx_fully_8x8_sse42() 38 * 39 * @remarks 40 * None 41 * 42 ******************************************************************************* 43 */ 44 #include <stdio.h> 45 #include <string.h> 46 #include "iv_datatypedef.h" 47 #include "impeg2_macros.h" 48 #include "impeg2_defs.h" 49 #include "impeg2_inter_pred.h" 50 51 #include <immintrin.h> 52 #include <emmintrin.h> 53 #include <smmintrin.h> 54 #include <tmmintrin.h> 55 56 /******************************************************************************* 57 * Function Name : impeg2_copy_mb 58 * 59 * Description : copies 3 components to the frame from mc_buf 60 * 61 * Arguments : 62 * src_buf : Source Buffer 63 * dst_buf : Destination Buffer 64 * src_wd : Source Width 65 * dst_wd : destination Width 66 * 67 * Values Returned : None 68 *******************************************************************************/ 69 void impeg2_copy_mb_sse42(yuv_buf_t *src_buf, 70 yuv_buf_t *dst_buf, 71 UWORD32 src_wd, 72 UWORD32 dst_wd) 73 { 74 UWORD8 *src; 75 UWORD8 *dst; 76 __m128i src_r0, src_r1, src_r2, src_r3; 77 78 /*******************************************************/ 79 /* copy Y */ 80 /*******************************************************/ 81 src = src_buf->pu1_y; 82 dst = dst_buf->pu1_y; 83 // Row 0-3 84 src_r0 = _mm_loadu_si128((__m128i *) (src)); 85 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 86 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 87 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 88 89 _mm_storeu_si128((__m128i *) dst, src_r0); 90 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 91 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 92 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 93 94 // Row 4-7 95 src += 4 * src_wd; 96 dst += 4 * dst_wd; 97 src_r0 = _mm_loadu_si128((__m128i *) (src)); 98 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 99 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 100 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 101 102 _mm_storeu_si128((__m128i *) dst, src_r0); 103 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 104 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 105 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 106 107 // Row 8-11 108 src += 4 * src_wd; 109 dst += 4 * dst_wd; 110 src_r0 = _mm_loadu_si128((__m128i *) (src)); 111 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 112 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 113 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 114 115 _mm_storeu_si128((__m128i *) dst, src_r0); 116 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 117 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 118 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 119 120 // Row 12-15 121 src += 4 * src_wd; 122 dst += 4 * dst_wd; 123 src_r0 = _mm_loadu_si128((__m128i *) (src)); 124 src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); 125 src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); 126 src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); 127 128 _mm_storeu_si128((__m128i *) dst, src_r0); 129 _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); 130 _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); 131 _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); 132 133 src_wd >>= 1; 134 dst_wd >>= 1; 135 136 /*******************************************************/ 137 /* copy U */ 138 /*******************************************************/ 139 src = src_buf->pu1_u; 140 dst = dst_buf->pu1_u; 141 142 // Row 0-3 143 src_r0 = _mm_loadl_epi64((__m128i *)src); 144 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 145 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 146 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 147 148 _mm_storel_epi64((__m128i *)dst, src_r0); 149 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 150 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 151 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 152 153 // Row 4-7 154 src += 4 * src_wd; 155 dst += 4 * dst_wd; 156 157 src_r0 = _mm_loadl_epi64((__m128i *)src); 158 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 159 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 160 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 161 162 _mm_storel_epi64((__m128i *)dst, src_r0); 163 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 164 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 165 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 166 167 /*******************************************************/ 168 /* copy V */ 169 /*******************************************************/ 170 src = src_buf->pu1_v; 171 dst = dst_buf->pu1_v; 172 // Row 0-3 173 src_r0 = _mm_loadl_epi64((__m128i *)src); 174 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 175 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 176 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 177 178 _mm_storel_epi64((__m128i *)dst, src_r0); 179 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 180 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 181 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 182 183 // Row 4-7 184 src += 4 * src_wd; 185 dst += 4 * dst_wd; 186 187 src_r0 = _mm_loadl_epi64((__m128i *)src); 188 src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); 189 src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); 190 src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); 191 192 _mm_storel_epi64((__m128i *)dst, src_r0); 193 _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); 194 _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); 195 _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); 196 } 197 198 /*****************************************************************************/ 199 /* */ 200 /* Function Name : impeg2_interpolate */ 201 /* */ 202 /* Description : averages the contents of buf_src1 and buf_src2 and stores*/ 203 /* result in buf_dst */ 204 /* */ 205 /* Inputs : buf_src1 - First Source */ 206 /* buf_src2 - Second Source */ 207 /* */ 208 /* Globals : None */ 209 /* */ 210 /* Processing : Avg the values from two sources and store the result in */ 211 /* destination buffer */ 212 /* */ 213 /* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ 214 /* */ 215 /* Returns : None */ 216 /* */ 217 /* Issues : Assumes that all 3 buffers are of same size */ 218 /* */ 219 /*****************************************************************************/ 220 void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, 221 yuv_buf_t *buf_src2, 222 yuv_buf_t *buf_dst, 223 UWORD32 stride) 224 { 225 UWORD8 *src1, *src2; 226 UWORD8 *dst; 227 __m128i src1_r0, src1_r1, src1_r2, src1_r3; 228 __m128i src2_r0, src2_r1, src2_r2, src2_r3; 229 230 /*******************************************************/ 231 /* interpolate Y */ 232 /*******************************************************/ 233 src1 = buf_src1->pu1_y; 234 src2 = buf_src2->pu1_y; 235 dst = buf_dst->pu1_y; 236 // Row 0-3 237 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 238 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 239 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 240 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 241 242 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 243 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 244 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 245 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 246 247 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 248 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 249 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 250 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 251 252 _mm_storeu_si128((__m128i *) dst, src1_r0); 253 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 254 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 255 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 256 257 // Row 4-7 258 src1 += 4 * 16; 259 src2 += 4 * 16; 260 dst += 4 * stride; 261 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 262 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 263 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 264 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 265 266 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 267 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 268 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 269 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 270 271 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 272 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 273 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 274 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 275 276 _mm_storeu_si128((__m128i *) dst, src1_r0); 277 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 278 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 279 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 280 281 // Row 8-11 282 src1 += 4 * 16; 283 src2 += 4 * 16; 284 dst += 4 * stride; 285 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 286 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 287 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 288 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 289 290 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 291 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 292 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 293 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 294 295 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 296 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 297 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 298 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 299 300 _mm_storeu_si128((__m128i *) dst, src1_r0); 301 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 302 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 303 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 304 305 // Row 12-15 306 src1 += 4 * 16; 307 src2 += 4 * 16; 308 dst += 4 * stride; 309 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); 310 src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); 311 src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); 312 src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); 313 314 src2_r0 = _mm_loadu_si128((__m128i *) (src2)); 315 src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); 316 src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); 317 src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); 318 319 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 320 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 321 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 322 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 323 324 _mm_storeu_si128((__m128i *) dst, src1_r0); 325 _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); 326 _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); 327 _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); 328 329 stride >>= 1; 330 331 /*******************************************************/ 332 /* interpolate U */ 333 /*******************************************************/ 334 src1 = buf_src1->pu1_u; 335 src2 = buf_src2->pu1_u; 336 dst = buf_dst->pu1_u; 337 // Row 0-3 338 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 339 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 340 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 341 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 342 343 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 344 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 345 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 346 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 347 348 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 349 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 350 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 351 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 352 353 _mm_storel_epi64((__m128i *) dst, src1_r0); 354 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 355 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 356 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 357 358 // Row 4-7 359 src1 += 4 * 8; 360 src2 += 4 * 8; 361 dst += 4 * stride; 362 363 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 364 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 365 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 366 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 367 368 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 369 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 370 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 371 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 372 373 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 374 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 375 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 376 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 377 378 _mm_storel_epi64((__m128i *) dst, src1_r0); 379 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 380 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 381 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 382 383 /*******************************************************/ 384 /* interpolate V */ 385 /*******************************************************/ 386 src1 = buf_src1->pu1_v; 387 src2 = buf_src2->pu1_v; 388 dst = buf_dst->pu1_v; 389 390 // Row 0-3 391 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 392 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 393 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 394 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 395 396 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 397 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 398 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 399 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 400 401 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 402 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 403 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 404 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 405 406 _mm_storel_epi64((__m128i *) dst, src1_r0); 407 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 408 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 409 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 410 411 // Row 4-7 412 src1 += 4 * 8; 413 src2 += 4 * 8; 414 dst += 4 * stride; 415 416 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); 417 src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); 418 src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); 419 src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); 420 421 src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); 422 src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); 423 src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); 424 src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); 425 426 src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); 427 src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); 428 src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); 429 src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); 430 431 _mm_storel_epi64((__m128i *) dst, src1_r0); 432 _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); 433 _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); 434 _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); 435 } 436 437 /*****************************************************************************/ 438 /* */ 439 /* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ 440 /* */ 441 /* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ 442 /* and the above block of size 8 x 8 will be placed as a */ 443 /* block from the current position of out_buf */ 444 /* */ 445 /* Inputs : ref - Reference frame from which the block will be */ 446 /* block will be extracted. */ 447 /* ref_wid - WIdth of reference frame */ 448 /* out_wid - WIdth of the output frame */ 449 /* blk_width - width of the block */ 450 /* blk_width - height of the block */ 451 /* */ 452 /* Globals : None */ 453 /* */ 454 /* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ 455 /* the ref frame.Interpolate these four values to get the */ 456 /* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ 457 /* using 9 x 9 block from reference frame */ 458 /* */ 459 /* Outputs : out - Output containing the extracted block */ 460 /* */ 461 /* Returns : None */ 462 /* */ 463 /* Issues : None */ 464 /* */ 465 /*****************************************************************************/ 466 void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, 467 UWORD8 *ref, 468 UWORD32 ref_wid, 469 UWORD32 out_wid) 470 { 471 UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; 472 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 473 /* estimated */ 474 /* 475 P0 P1 476 Q 477 P2 P3 478 */ 479 __m128i src_r0, src_r0_1, src_r1, src_r1_1; 480 __m128i tmp0, tmp1; 481 __m128i value_2 = _mm_set1_epi16(2); 482 483 ref_p0 = ref; 484 ref_p1 = ref + 1; 485 ref_p2 = ref + ref_wid; 486 ref_p3 = ref + ref_wid + 1; 487 488 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 489 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 490 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 491 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 492 493 src_r0 = _mm_cvtepu8_epi16(src_r0); 494 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 495 src_r1 = _mm_cvtepu8_epi16(src_r1); 496 src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); 497 498 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation 499 tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation 500 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation 501 tmp0 = _mm_add_epi16(tmp0, value_2); 502 tmp0 = _mm_srli_epi16(tmp0, 2); 503 tmp0 = _mm_packus_epi16(tmp0, value_2); 504 505 _mm_storel_epi64((__m128i *)out, tmp0); 506 507 //Row 1 508 ref_p2 += ref_wid; 509 ref_p3 += ref_wid; 510 out += out_wid; 511 512 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 2 513 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 514 515 src_r0 = _mm_cvtepu8_epi16(src_r0); 516 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 517 518 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation 519 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation 520 tmp1 = _mm_add_epi16(tmp1, value_2); 521 tmp1 = _mm_srli_epi16(tmp1, 2); 522 tmp1 = _mm_packus_epi16(tmp1, value_2); 523 524 _mm_storel_epi64((__m128i *)out, tmp1); 525 526 //Row 2 527 ref_p2 += ref_wid; 528 ref_p3 += ref_wid; 529 out += out_wid; 530 531 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 532 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 533 534 src_r0 = _mm_cvtepu8_epi16(src_r0); 535 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 536 537 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation 538 539 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation 540 tmp0 = _mm_add_epi16(tmp0, value_2); 541 tmp0 = _mm_srli_epi16(tmp0, 2); 542 tmp0 = _mm_packus_epi16(tmp0, value_2); 543 544 _mm_storel_epi64((__m128i *)out, tmp0); 545 546 //Row 3 547 ref_p2 += ref_wid; 548 ref_p3 += ref_wid; 549 out += out_wid; 550 551 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 552 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 553 554 src_r0 = _mm_cvtepu8_epi16(src_r0); 555 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 556 557 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation 558 559 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation 560 tmp1 = _mm_add_epi16(tmp1, value_2); 561 tmp1 = _mm_srli_epi16(tmp1, 2); 562 tmp1 = _mm_packus_epi16(tmp1, value_2); 563 564 _mm_storel_epi64((__m128i *)out, tmp1); 565 566 //Row 4 567 ref_p2 += ref_wid; 568 ref_p3 += ref_wid; 569 out += out_wid; 570 571 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 572 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 573 574 src_r0 = _mm_cvtepu8_epi16(src_r0); 575 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 576 577 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation 578 579 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical interpolation 580 tmp0 = _mm_add_epi16(tmp0, value_2); 581 tmp0 = _mm_srli_epi16(tmp0, 2); 582 tmp0 = _mm_packus_epi16(tmp0, value_2); 583 584 _mm_storel_epi64((__m128i *)out, tmp0); 585 586 //Row 5 587 ref_p2 += ref_wid; 588 ref_p3 += ref_wid; 589 out += out_wid; 590 591 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 592 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 593 594 src_r0 = _mm_cvtepu8_epi16(src_r0); 595 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 596 597 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation 598 599 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation 600 tmp1 = _mm_add_epi16(tmp1, value_2); 601 tmp1 = _mm_srli_epi16(tmp1, 2); 602 tmp1 = _mm_packus_epi16(tmp1, value_2); 603 604 _mm_storel_epi64((__m128i *)out, tmp1); 605 606 //Row 6 607 ref_p2 += ref_wid; 608 ref_p3 += ref_wid; 609 out += out_wid; 610 611 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 612 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 613 614 src_r0 = _mm_cvtepu8_epi16(src_r0); 615 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 616 617 tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation 618 619 tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation 620 tmp0 = _mm_add_epi16(tmp0, value_2); 621 tmp0 = _mm_srli_epi16(tmp0, 2); 622 tmp0 = _mm_packus_epi16(tmp0, value_2); 623 624 _mm_storel_epi64((__m128i *)out, tmp0); 625 626 //Row 7 627 ref_p2 += ref_wid; 628 ref_p3 += ref_wid; 629 out += out_wid; 630 631 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 632 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); 633 634 src_r0 = _mm_cvtepu8_epi16(src_r0); 635 src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); 636 637 tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation 638 639 tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation 640 tmp1 = _mm_add_epi16(tmp1, value_2); 641 tmp1 = _mm_srli_epi16(tmp1, 2); 642 tmp1 = _mm_packus_epi16(tmp1, value_2); 643 644 _mm_storel_epi64((__m128i *)out, tmp1); 645 646 return; 647 } 648 649 /*****************************************************************************/ 650 /* */ 651 /* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ 652 /* */ 653 /* Description : Gets the buffer from (0.5,0) to (8.5,8) */ 654 /* and the above block of size 8 x 8 will be placed as a */ 655 /* block from the current position of out_buf */ 656 /* */ 657 /* Inputs : ref - Reference frame from which the block will be */ 658 /* block will be extracted. */ 659 /* ref_wid - WIdth of reference frame */ 660 /* out_wid - WIdth of the output frame */ 661 /* blk_width - width of the block */ 662 /* blk_width - height of the block */ 663 /* */ 664 /* Globals : None */ 665 /* */ 666 /* Processing : Point to the (0,0) and (1,0) position in the ref frame */ 667 /* Interpolate these two values to get the value at(0.5,0) */ 668 /* Repeat this to get an 8 x 8 block using 9 x 8 block from */ 669 /* reference frame */ 670 /* */ 671 /* Outputs : out - Output containing the extracted block */ 672 /* */ 673 /* Returns : None */ 674 /* */ 675 /* Issues : None */ 676 /* */ 677 /*****************************************************************************/ 678 void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, 679 UWORD8 *ref, 680 UWORD32 ref_wid, 681 UWORD32 out_wid) 682 { 683 UWORD8 *ref_p0,*ref_p1; 684 __m128i src_r0, src_r0_1, src_r1, src_r1_1; 685 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 686 /* estimated */ 687 /* 688 P0 Q P1 689 */ 690 691 ref_p0 = ref; 692 ref_p1 = ref + 1; 693 694 // Row 0 and 1 695 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 696 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 697 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 698 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 699 700 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 701 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 702 703 _mm_storel_epi64((__m128i *)out, src_r0); 704 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 705 706 // Row 2 and 3 707 ref_p0 += 2*ref_wid; 708 ref_p1 += 2*ref_wid; 709 out += 2*out_wid; 710 711 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 712 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 713 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 3 714 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 715 716 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 717 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 718 719 _mm_storel_epi64((__m128i *)out, src_r0); 720 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 721 722 // Row 4 and 5 723 ref_p0 += 2*ref_wid; 724 ref_p1 += 2*ref_wid; 725 out += 2*out_wid; 726 727 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 728 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 729 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 730 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 731 732 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 733 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 734 735 _mm_storel_epi64((__m128i *)out, src_r0); 736 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 737 738 // Row 6 and 7 739 ref_p0 += 2*ref_wid; 740 ref_p1 += 2*ref_wid; 741 out += 2*out_wid; 742 743 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 744 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); 745 src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 746 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); 747 748 src_r0 = _mm_avg_epu8(src_r0, src_r0_1); 749 src_r1 = _mm_avg_epu8(src_r1, src_r1_1); 750 751 _mm_storel_epi64((__m128i *)out, src_r0); 752 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 753 754 return; 755 } 756 757 758 /*****************************************************************************/ 759 /* */ 760 /* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ 761 /* */ 762 /* Description : Gets the buffer from (0,0.5) to (8,8.5) */ 763 /* and the above block of size 8 x 8 will be placed as a */ 764 /* block from the current position of out_buf */ 765 /* */ 766 /* Inputs : ref - Reference frame from which the block will be */ 767 /* block will be extracted. */ 768 /* ref_wid - WIdth of reference frame */ 769 /* out_wid - WIdth of the output frame */ 770 /* blk_width - width of the block */ 771 /* blk_width - height of the block */ 772 /* */ 773 /* Globals : None */ 774 /* */ 775 /* Processing : Point to the (0,0) and (0,1) position in the ref frame */ 776 /* Interpolate these two values to get the value at(0,0.5) */ 777 /* Repeat this to get an 8 x 8 block using 8 x 9 block from */ 778 /* reference frame */ 779 /* */ 780 /* Outputs : out - Output containing the extracted block */ 781 /* */ 782 /* Returns : None */ 783 /* */ 784 /* Issues : None */ 785 /* */ 786 /*****************************************************************************/ 787 void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, 788 UWORD8 *ref, 789 UWORD32 ref_wid, 790 UWORD32 out_wid) 791 { 792 __m128i src_r0, src_r1, src_r2, temp0, temp1; 793 /* P0-P3 are the pixels in the reference frame and Q is the value being */ 794 /* estimated */ 795 /* 796 P0 797 x 798 P1 799 */ 800 src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 801 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 802 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 803 temp0 = _mm_avg_epu8(src_r0, src_r1); 804 temp1 = _mm_avg_epu8(src_r1, src_r2); 805 _mm_storel_epi64((__m128i *)out, temp0); //Row 0 806 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 807 808 ref+= 3*ref_wid; 809 out+= 2*out_wid; 810 811 src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 812 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 813 temp0 = _mm_avg_epu8(src_r2, src_r0); 814 temp1 = _mm_avg_epu8(src_r0, src_r1); 815 _mm_storel_epi64((__m128i *)out, temp0); //Row 2 816 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 817 818 ref += 2*ref_wid; 819 out+= 2*out_wid; 820 821 src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 822 src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 823 temp0 = _mm_avg_epu8(src_r1, src_r2); 824 temp1 = _mm_avg_epu8(src_r2, src_r0); 825 _mm_storel_epi64((__m128i *)out, temp0); //Row 4 826 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 827 828 ref += 2*ref_wid; 829 out+= 2*out_wid; 830 831 src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 832 src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 833 temp0 = _mm_avg_epu8(src_r0, src_r1); 834 temp1 = _mm_avg_epu8(src_r1, src_r2); 835 _mm_storel_epi64((__m128i *)out, temp0); //Row 6 836 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 837 838 return; 839 } 840 841 /*****************************************************************************/ 842 /* */ 843 /* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ 844 /* */ 845 /* Description : Gets the buffer from (x,y) to (x+8,y+8) */ 846 /* and the above block of size 8 x 8 will be placed as a */ 847 /* block from the current position of out_buf */ 848 /* */ 849 /* Inputs : ref - Reference frame from which the block will be */ 850 /* block will be extracted. */ 851 /* ref_wid - WIdth of reference frame */ 852 /* out_wid - WIdth of the output frame */ 853 /* blk_width - width of the block */ 854 /* blk_width - height of the block */ 855 /* */ 856 /* Globals : None */ 857 /* */ 858 /* Processing : Point to the (0,0) position in the ref frame */ 859 /* Get an 8 x 8 block from reference frame */ 860 /* */ 861 /* Outputs : out - Output containing the extracted block */ 862 /* */ 863 /* Returns : None */ 864 /* */ 865 /* Issues : None */ 866 /* */ 867 /*****************************************************************************/ 868 void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, 869 UWORD8 *ref, 870 UWORD32 ref_wid, 871 UWORD32 out_wid) 872 { 873 __m128i src_r0, src_r1, src_r2, src_r3; 874 // Row 0-3 875 src_r0 = _mm_loadl_epi64((__m128i *)ref); 876 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); 877 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); 878 src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); 879 880 _mm_storel_epi64((__m128i *)out, src_r0); 881 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 882 _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); 883 _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); 884 885 // Row 4-7 886 ref += 4 * ref_wid; 887 out += 4 * out_wid; 888 889 src_r0 = _mm_loadl_epi64((__m128i *)ref); 890 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); 891 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); 892 src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); 893 894 _mm_storel_epi64((__m128i *)out, src_r0); 895 _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); 896 _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); 897 _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); 898 return; 899 } 900