;//
;//
;// File Name:  omxVCM4P10_InterpolateLuma_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

;// Function:
;//     omxVCM4P10_InterpolateLuma
;//
;// This function implements omxVCM4P10_InterpolateLuma in v6 assembly.
;// Performs quarter pel interpolation of inter luma MB.
;// It's assumed that the frame is already padded when calling this function.
;// Parameters:
;// [in]    pSrc        Pointer to the source reference frame buffer
;// [in]    srcStep     Reference frame step in bytes
;// [in]    dstStep     Destination frame step in bytes. Must be a multiple of roi.width
;// [in]    dx          Fractional part of horizontal motion vector
;//                         component in 1/4 pixel unit; valid in the range [0,3]
;// [in]    dy          Fractional part of vertical motion vector
;//                         component in 1/4 pixel unit; valid in the range [0,3]
;// [in]    roi         Dimension of the interpolation region; the parameters roi.width and roi.height must
;//                         be equal to either 4, 8, or 16.
;// [out]   pDst        Pointer to the destination frame buffer.
;//                   if roi.width==4,  4-byte alignment required
;//                   if roi.width==8,  8-byte alignment required
;//                   if roi.width==16, 16-byte alignment required
;//
;// Return Value:
;// If the function runs without error, it returns OMX_Sts_NoErr.
;// It is assumed that the following conditions are satisfied before calling this function:
;//  pSrc and pDst are not NULL.
;//  srcStep and dstStep are >= roi.width.
;//  dx and dy are in the range [0-3].
;//  roi.width and roi.height are each one of {4, 8, 16}.
;//  If roi.width is equal to 4, pDst is 4 byte aligned.
;//  If roi.width is equal to 8, pDst is 8 byte aligned.
;//  If roi.width is equal to 16, pDst is 16 byte aligned.
;//  srcStep and dstStep are multiples of 8.
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT omxVCM4P10_InterpolateLuma


    IF CortexA8
        ;// 4x4 helper kernels. Their exact register contracts are defined in
        ;// their own source files; this file only relies on the result
        ;// registers named by the DN/QN aliases declared further down.
        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    ENDIF



;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3
iHeight         RN 4
iWidth          RN 5

;// Declare other intermediate registers
;// NOTE: idx and index deliberately share r6; idx is dead once index
;// has been computed from it.
idx             RN 6
idy             RN 7
index           RN 6
Temp            RN 12
pArgs           RN 11


    IF CortexA8

        ;//
        ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time.
        ;//
        ;// Control flow of the routine below:
        ;//   1. index = dx + 4*dy selects one of 16 case handlers.
        ;//   2. For every 4x4 tile of the roi, {pSrc,srcStep,pDst,dstStep}
        ;//      are saved to the ppArgs stack area, the selected handler
        ;//      writes four 32-bit rows to pDst, and the saved values are
        ;//      reloaded at Block4x4LoopEnd.
        ;//   3. Tiles are walked left-to-right (step 4 bytes), then
        ;//      top-to-bottom (step 4 rows).
        ;//   4. r0 is set to 0 on exit.
        ;//
        M_ALLOC4        ppArgs, 16              ;// 16 bytes: the 4 words stored by STM below

        ;// Function header
        ;// NOTE(review): "r11, d15" presumably asks the macro to preserve
        ;// r4-r11 and d8-d15 - confirm against armCOMM_s.h.
        M_START omxVCM4P10_InterpolateLuma, r11, d15

pSrcBK          RN 8

        ;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

        ;// Registers used for implementing Horizontal interpolation
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8
dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8
dAccH0          DN 22.U8
dAccH1          DN 24.U8
dAccH2          DN 26.U8
dAccH3          DN 28.U8
dResultH0       DN 22.U32
dResultH1       DN 24.U32
dResultH2       DN 26.U32
dResultH3       DN 28.U32

        ;// Registers used for implementing Vertical interpolation
dSrc0           DN 9.U8
dSrc1           DN 10.U8
dSrc2           DN 11.U8
dSrc3           DN 12.U8
dSrc4           DN 13.U8
dAccV0          DN 0.U8
dAccV1          DN 2.U8
dAccV2          DN 4.U8
dAccV3          DN 6.U8
dResultV0       DN 0.U32
dResultV1       DN 2.U32
dResultV2       DN 4.U32
dResultV3       DN 6.U32

        ;// Registers used for implementing Diagonal interpolation
dTAcc0          DN 0.U8
dTAcc1          DN 2.U8
dTAcc2          DN 4.U8
dTAcc3          DN 6.U8
dTRes0          DN 0.32
dTRes1          DN 2.32
dTRes2          DN 4.32
dTRes3          DN 6.32
dTResult0       DN 14.U8
dTResult1       DN 16.U8
dTResult2       DN 18.U8
dTResult3       DN 20.U8
dTempP0         DN 18.S16
dTempP1         DN 19.S16
dTempQ0         DN 20.S16
dTempQ1         DN 21.S16
dTempR0         DN 22.S16
dTempR1         DN 23.S16
dTempS0         DN 24.S16
dTempS1         DN 25.S16
qTempP01        QN 9.S16
qTempQ01        QN 10.S16
qTempR01        QN 11.S16
qTempS01        QN 12.S16

        ;// Intermediate values for averaging
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16

        ;// For implementing copy
dDst0           DN 9.32
dDst1           DN 10.32
dDst2           DN 11.32
dDst3           DN 12.32

        ;// Define stack arguments
        M_ARG       ptridx, 4
        M_ARG       ptridy, 4
        M_ARG       ptrWidth, 4
        M_ARG       ptrHeight, 4

        ;// Load structure elements of roi
        M_LDR       idx, ptridx
        M_LDR       idy, ptridy
        M_LDR       iWidth, ptrWidth
        M_LDR       iHeight, ptrHeight

        ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx]
        M_ADR       pArgs, ppArgs

        ;// Move coefficients Neon registers
        ;// (not read in this file; presumably consumed by the helper
        ;// routines - confirm against their sources)
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Both loops share one entry point; they differ only in the
        ;// pointer adjustment done before branching back here.
Block4x4WidthLoop
Block4x4HeightLoop

        ;// Save the tile's base pointers: every case handler is free to
        ;// modify pSrc/pDst, and Block4x4LoopEnd reloads them from here.
        STM         pArgs, {pSrc,srcStep,pDst,dstStep}

        ;// switch table using motion vector as index
        ;// Assumes ARM (not Thumb) state: reading pc yields the address of
        ;// the ADD plus 8, so index 0 lands on the SECOND branch below
        ;// (B Case_0). The first branch is a padding slot that the
        ;// computed jump never selects.
        ADD         pc, pc, index, LSL #2
        B           Case_f
        B           Case_0
        B           Case_1
        B           Case_2
        B           Case_3
        B           Case_4
        B           Case_5
        B           Case_6
        B           Case_7
        B           Case_8
        B           Case_9
        B           Case_a
        B           Case_b
        B           Case_c
        B           Case_d
        B           Case_e
        B           Case_f

Case_0
        ;// Case G
        M_PRINTF "Case 0 \n"

        ;// Loads a 4x4 block of .8 and stores as .32
        ;// Rows 0/1 are addressed through pSrc, rows 2/3 through Temp.
        ;// Each VLD1 reads a whole D register; only lane 0 (32 bits)
        ;// of each row is written back.
        ADD         Temp, pSrc, srcStep, LSL #1
        VLD1        dSrc0, [pSrc], srcStep
        VLD1        dSrc2, [Temp], srcStep
        VLD1        dSrc1, [pSrc]
        VLD1        dSrc3, [Temp]

        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dDst0[0], [pDst], dstStep
        VST1        dDst2[0], [Temp], dstStep
        VST1        dDst1[0], [pDst]
        VST1        dDst3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_1
        ;// Case a
        M_PRINTF "Case 1 \n"

        ;// Helper result (dAccH0-3) is rounding-averaged with dSrc0c-3c.
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dSrc0c
        VRHADD      dAccH2, dAccH2, dSrc2c
        VRHADD      dAccH1, dAccH1, dSrc1c
        VRHADD      dAccH3, dAccH3, dSrc3c
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_2
        ;// Case b
        M_PRINTF "Case 2 \n"

        ;// Helper result is stored unmodified.
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_3
        ;// Case c
        M_PRINTF "Case 3 \n"

        ;// Same as Case_1 but averaged with dSrc0d-3d instead of dSrc0c-3c.
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dSrc0d
        VRHADD      dAccH2, dAccH2, dSrc2d
        VRHADD      dAccH1, dAccH1, dSrc1d
        VRHADD      dAccH3, dAccH3, dSrc3d
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_4
        ;// Case d
        M_PRINTF "Case 4 \n"

        ;// Helper is entered two rows above the tile; its result
        ;// (dAccV0-3) is rounding-averaged with dSrc0-3.
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD      dAccV0, dAccV0, dSrc0
        VRHADD      dAccV2, dAccV2, dSrc2
        VRHADD      dAccV1, dAccV1, dSrc1
        VRHADD      dAccV3, dAccV3, dSrc3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_5
        ;// Case e
        M_PRINTF "Case 5 \n"

        ;// Vertical helper first, then horizontal helper from the saved
        ;// tile origin; the two results are rounding-averaged.
        MOV         pSrcBK, pSrc
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB         pSrc, pSrcBK, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_6
        ;// Case f
        M_PRINTF "Case 6 \n"

        ;// qRes2-5 are narrowed (rounding >>5, unsigned saturation) and
        ;// rounding-averaged with dTAcc0-3.
        SUB         pSrc, pSrc, srcStep, LSL #1
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        VQRSHRUN    dTResult0, qRes2, #5
        VQRSHRUN    dTResult1, qRes3, #5
        VQRSHRUN    dTResult2, qRes4, #5
        VQRSHRUN    dTResult3, qRes5, #5
        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_7
        ;// Case g
        M_PRINTF "Case 7 \n"
        ;// Same as Case_5 except the vertical helper starts one byte
        ;// to the right of the tile origin.
        MOV         pSrcBK, pSrc
        ADD         pSrc, pSrc, #1
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB         pSrc, pSrcBK, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_8
        ;// Case h
        M_PRINTF "Case 8 \n"

        ;// Helper result is stored unmodified.
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_9
        ;// Case i
        M_PRINTF "Case 9 \n"
        ;// VEXT #2 re-aligns each 16-bit row pair by two elements before
        ;// it is narrowed and rounding-averaged with dTAcc0-3.
        SUB         pSrc, pSrc, srcStep, LSL #1
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        VEXT        dTempP0, dTempP0, dTempP1, #2
        VEXT        dTempQ0, dTempQ0, dTempQ1, #2
        VEXT        dTempR0, dTempR0, dTempR1, #2
        VEXT        dTempS0, dTempS0, dTempS1, #2

        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_a
        ;// Case j
        M_PRINTF "Case a \n"

        ;// Helper result is stored unmodified.
        SUB         pSrc, pSrc, srcStep, LSL #1
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_b
        ;// Case k
        M_PRINTF "Case b \n"
        ;// Same as Case_9 but the VEXT offset is three elements.
        SUB         pSrc, pSrc, srcStep, LSL #1
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        VEXT        dTempP0, dTempP0, dTempP1, #3
        VEXT        dTempQ0, dTempQ0, dTempQ1, #3
        VEXT        dTempR0, dTempR0, dTempR1, #3
        VEXT        dTempS0, dTempS0, dTempS1, #3

        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_c
        ;// Case n
        M_PRINTF "Case c \n"

        ;// Same as Case_4 but averaged with dSrc1-4 instead of dSrc0-3.
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD      dAccV0, dAccV0, dSrc1
        VRHADD      dAccV2, dAccV2, dSrc3
        VRHADD      dAccV1, dAccV1, dSrc2
        VRHADD      dAccV3, dAccV3, dSrc4
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_d
        ;// Case p
        M_PRINTF "Case d \n"

        ;// Same as Case_5 except the horizontal helper starts one row
        ;// below the tile origin.
        MOV         pSrcBK, pSrc
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         pSrc, pSrcBK, srcStep
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_e
        ;// Case q
        M_PRINTF "Case e \n"

        ;// Same as Case_6 but narrows qRes3-6 instead of qRes2-5.
        SUB         pSrc, pSrc, srcStep, LSL #1
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        VQRSHRUN    dTResult0, qRes3, #5
        VQRSHRUN    dTResult1, qRes4, #5
        VQRSHRUN    dTResult2, qRes5, #5
        VQRSHRUN    dTResult3, qRes6, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_f
        ;// Case r
        M_PRINTF "Case f \n"
        ;// Vertical helper starts one byte right of the tile origin,
        ;// horizontal helper one row below it; results are averaged.
        MOV         pSrcBK, pSrc
        ADD         pSrc, pSrc, #1
        SUB         pSrc, pSrc, srcStep, LSL #1
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         pSrc, pSrcBK, srcStep
        SUB         pSrc, pSrc, #2
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        ;// Last case: falls through into Block4x4LoopEnd.


Block4x4LoopEnd

        ;// Width Loop
        ;// pArgs was re-derived by each case just before arriving here.
        ;// NOTE(review): presumably because the helpers may clobber r11,
        ;// and kept per-case rather than hoisted for scheduling - confirm
        ;// before consolidating.
        ;//M_ADR       pArgs, ppArgs
        LDM         pArgs, {pSrc,srcStep,pDst,dstStep}      ;// Load arguments
        SUBS        iWidth, iWidth, #4
        ADD         pSrc, pSrc, #4                          ;// next tile: 4 bytes to the right
        ADD         pDst, pDst, #4
        BGT         Block4x4WidthLoop

        ;// Height Loop
        ;// Only SUBS sets flags here; the instructions between it and BGT
        ;// are assumed not to alter them (M_LDR/M_ADR are macros - confirm).
        SUBS        iHeight, iHeight, #4
        M_LDR       iWidth, ptrWidth                        ;// restart the column counter
        M_ADR       pArgs, ppArgs
        ADD         pSrc, pSrc, srcStep, LSL #2             ;// down 4 rows ...
        ADD         pDst, pDst, dstStep, LSL #2
        SUB         pSrc, pSrc, iWidth                      ;// ... and back to the left edge
        SUB         pDst, pDst, iWidth
        BGT         Block4x4HeightLoop

EndOfInterpolation
        MOV         r0, #0                                  ;// return status 0
        M_END

    ENDIF
        ;// End of CortexA8

    END
