;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_InterpolateLuma_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

;// Function:
;//     omxVCM4P10_InterpolateLuma
;//
;// This function implements omxVCM4P10_InterpolateLuma in ARM assembly
;// (the code in this file is the CortexA8 variant and uses NEON instructions).
;// Performs quarter pel interpolation of inter luma MB.
;// It's assumed that the frame is already padded when calling this function.
;// Parameters:
;// [in]    pSrc        Pointer to the source reference frame buffer
;// [in]    srcStep     Reference frame step in bytes
;// [in]    dstStep     Destination frame step in bytes. Must be a multiple of roi.width
;// [in]    dx          Fractional part of horizontal motion vector
;//                         component in 1/4 pixel unit; valid in the range [0,3]
;// [in]    dy          Fractional part of vertical motion vector
;//                         component in 1/4 pixel unit; valid in the range [0,3]
;// [in]    roi         Dimension of the interpolation region; the parameters roi.width and roi.height must
;//                         be equal to either 4, 8, or 16.
;// [out]   pDst        Pointer to the destination frame buffer.
;//                         if roi.width==4,  4-byte alignment required
;//                         if roi.width==8,  8-byte alignment required
;//                         if roi.width==16, 16-byte alignment required
;//
;// Return Value:
;// If the function runs without error, it returns OMX_Sts_NoErr.
;// It is assumed that the following conditions are satisfied before calling this function:
;//     pSrc and pDst are not NULL.
;//     srcStep and dstStep are >= roi.width.
;//     dx and dy are in the range [0-3].
;//     roi.width and roi.height are each one of {4, 8, 16}.
;//     If roi.width is equal to 4, pDst is 4 byte aligned.
;//     If roi.width is equal to 8, pDst is 8 byte aligned.
;//     If roi.width is equal to 16, pDst is 16 byte aligned.
;//     srcStep and dstStep are multiples of 8.
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT omxVCM4P10_InterpolateLuma


    IF CortexA8
        ;// 4x4 helper routines, defined outside this file.
        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    ENDIF



;// Declare input registers
;// r0-r3 carry the four register arguments; r4/r5 are loaded from the stack
;// arguments in the function prologue below.
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3
iHeight         RN 4
iWidth          RN 5

;// Declare other intermediate registers
;// index shares r6 with idx: computing index overwrites idx.
idx             RN 6
idy             RN 7
index           RN 6
Temp            RN 12
pArgs           RN 11


    IF CortexA8

        ;//
        ;// Interpolation of luma is implemented by processing block of pixels, size 4x4 at a time.
        ;//
        ;// Structure of the routine:
        ;//   1. Load dx, dy, roi.width, roi.height from the stack and form
        ;//      index = dx + 4*dy (0..15 when dx and dy are in [0,3]).
        ;//   2. For every 4x4 block: save {pSrc,srcStep,pDst,dstStep} to the
        ;//      16-byte ppArgs area, dispatch through a 16-entry branch table on
        ;//      index, write 4 rows of 4 bytes to pDst, then reload the saved
        ;//      arguments and step to the next block.
        ;//   3. Return 0 in r0.
        ;//

        ;// Scratch area on the stack holding the 4 saved argument words (4 x 4 bytes).
        M_ALLOC4    ppArgs, 16

        ;// Function header
        ;// NOTE(review): r11/d15 presumably name the highest core/NEON registers
        ;// the M_START/M_END macros preserve - confirm in armCOMM_s.h.
        M_START omxVCM4P10_InterpolateLuma, r11, d15

;// Copy of the block's pSrc kept across the first helper call in the cases
;// that call two helpers (Case_5, Case_7, Case_d, Case_f).
pSrcBK          RN 8

;// Declare Neon registers
;// NOTE(review): dCoeff5/dCoeff20 are not read in this file; presumably the
;// imported helper routines consume them - confirm against the helpers.
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

;// Registers used for implementing Horizontal interpolation
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8
dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8
dAccH0          DN 22.U8
dAccH1          DN 24.U8
dAccH2          DN 26.U8
dAccH3          DN 28.U8
;// dResultHn is the same D register as dAccHn, viewed as 32-bit lanes for the 4-byte row store.
dResultH0       DN 22.U32
dResultH1       DN 24.U32
dResultH2       DN 26.U32
dResultH3       DN 28.U32

;// Registers used for implementing Vertical interpolation
dSrc0           DN 9.U8
dSrc1           DN 10.U8
dSrc2           DN 11.U8
dSrc3           DN 12.U8
dSrc4           DN 13.U8
dAccV0          DN 0.U8
dAccV1          DN 2.U8
dAccV2          DN 4.U8
dAccV3          DN 6.U8
;// dResultVn is the same D register as dAccVn, viewed as 32-bit lanes for the 4-byte row store.
dResultV0       DN 0.U32
dResultV1       DN 2.U32
dResultV2       DN 4.U32
dResultV3       DN 6.U32

;// Registers used for implementing Diagonal interpolation
dTAcc0          DN 0.U8
dTAcc1          DN 2.U8
dTAcc2          DN 4.U8
dTAcc3          DN 6.U8
;// dTResn is the same D register as dTAccn, viewed as 32-bit lanes for the 4-byte row store.
dTRes0          DN 0.32
dTRes1          DN 2.32
dTRes2          DN 4.32
dTRes3          DN 6.32
dTResult0       DN 14.U8
dTResult1       DN 16.U8
dTResult2       DN 18.U8
dTResult3       DN 20.U8
;// dTemp{P,Q,R,S}0/1 are the low/high halves of qTemp{P,Q,R,S}01 (q9..q12 = d18..d25).
dTempP0         DN 18.S16
dTempP1         DN 19.S16
dTempQ0         DN 20.S16
dTempQ1         DN 21.S16
dTempR0         DN 22.S16
dTempR1         DN 23.S16
dTempS0         DN 24.S16
dTempS1         DN 25.S16
qTempP01        QN 9.S16
qTempQ01        QN 10.S16
qTempR01        QN 11.S16
qTempS01        QN 12.S16

;// Intermediate values for averaging
;// q7..q11 = d14..d23: these overlap dTResult0..3 (d14,d16,d18,d20), so the
;// narrowing sequences in Case_6 / Case_e depend on their instruction order.
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16

;// For implementing copy
;// dDstn is the same D register as dSrcn (d9..d12), viewed as 32-bit lanes.
dDst0           DN 9.32
dDst1           DN 10.32
dDst2           DN 11.32
dDst3           DN 12.32

        ;// Define stack arguments
        M_ARG       ptridx, 4
        M_ARG       ptridy, 4
        M_ARG       ptrWidth, 4
        M_ARG       ptrHeight, 4

        ;// Load structure elements of roi
        M_LDR       idx, ptridx
        M_LDR       idy, ptridy
        M_LDR       iWidth, ptrWidth
        M_LDR       iHeight, ptrHeight

        ADD         index, idx, idy, LSL #2                 ;//  [index] = [idy][idx]
        M_ADR       pArgs, ppArgs

        ;// Move coefficients Neon registers
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Both loop labels mark the same address: the top of the per-4x4-block body.
Block4x4WidthLoop
Block4x4HeightLoop

        ;// Save the block's arguments; the cases below modify pSrc/pDst and the
        ;// loop tail reloads them from here.
        STM         pArgs, {pSrc,srcStep,pDst,dstStep}

        ;// switch table using motion vector as index
        ;// In ARM state pc reads as the address of this ADD plus 8, so index 0
        ;// lands on "B Case_0". The first "B Case_f" only fills the slot at +4 and
        ;// is never a computed target for index >= 0.
        ADD         pc, pc, index, LSL #2
        B           Case_f
        B           Case_0
        B           Case_1
        B           Case_2
        B           Case_3
        B           Case_4
        B           Case_5
        B           Case_6
        B           Case_7
        B           Case_8
        B           Case_9
        B           Case_a
        B           Case_b
        B           Case_c
        B           Case_d
        B           Case_e
        B           Case_f

        ;// Conventions shared by all cases below:
        ;//  * Case_n handles index n, i.e. dx = n & 3, dy = n >> 2.
        ;//  * Each case stores one 32-bit lane (4 bytes) per row for 4 rows:
        ;//    rows 0 and 1 through pDst, rows 2 and 3 through Temp = pDst + 2*dstStep.
        ;//  * Each case re-derives pArgs before reaching Block4x4LoopEnd.
        ;//    NOTE(review): presumably because the helper routines may overwrite
        ;//    r11 - confirm against the helpers.
        ;//  * NOTE(review): which NEON registers each helper returns its rows in
        ;//    (dAccH*, dAccV*, dTAcc*, qRes*, qTemp*, dSrc*) is inferred from how
        ;//    the results are consumed here - confirm against the helpers.

Case_0
        ;// Case G  (dx=0, dy=0): plain copy of the 4x4 block
        M_PRINTF "Case 0 \n"

        ;// Loads a 4x4 block of .8 and stores as .32
        ;// Each VLD1 reads a full D register (8 bytes) per row; only the low
        ;// 4 bytes (lane 0 of the .32 view) are stored.
        ADD         Temp, pSrc, srcStep, LSL #1
        VLD1        dSrc0, [pSrc], srcStep
        VLD1        dSrc2, [Temp], srcStep
        VLD1        dSrc1, [pSrc]
        VLD1        dSrc3, [Temp]

        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dDst0[0], [pDst], dstStep
        VST1        dDst2[0], [Temp], dstStep
        VST1        dDst1[0], [pDst]
        VST1        dDst3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_1
        ;// Case a  (dx=1, dy=0): rounded average of the horizontal result with dSrcNc
        M_PRINTF "Case 1 \n"

        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dSrc0c
        VRHADD      dAccH2, dAccH2, dSrc2c
        VRHADD      dAccH1, dAccH1, dSrc1c
        VRHADD      dAccH3, dAccH3, dSrc3c
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_2
        ;// Case b  (dx=2, dy=0): horizontal helper result stored unmodified
        M_PRINTF "Case 2 \n"

        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_3
        ;// Case c  (dx=3, dy=0): rounded average of the horizontal result with dSrcNd
        M_PRINTF "Case 3 \n"

        SUB         pSrc, pSrc, #2                          ;// helper is entered 2 bytes left of the block
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dSrc0d
        VRHADD      dAccH2, dAccH2, dSrc2d
        VRHADD      dAccH1, dAccH1, dSrc1d
        VRHADD      dAccH3, dAccH3, dSrc3d
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_4
        ;// Case d  (dx=0, dy=1): rounded average of the vertical result with dSrc0..dSrc3
        M_PRINTF "Case 4 \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD      dAccV0, dAccV0, dSrc0
        VRHADD      dAccV2, dAccV2, dSrc2
        VRHADD      dAccV1, dAccV1, dSrc1
        VRHADD      dAccV3, dAccV3, dSrc3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_5
        ;// Case e  (dx=1, dy=1): rounded average of the vertical and horizontal results
        M_PRINTF "Case 5 \n"

        ;// NOTE(review): relies on pSrcBK (r8) surviving the first helper call and
        ;// on dAccV0..3 surviving the second - confirm against the helpers.
        MOV         pSrcBK, pSrc
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// vertical helper: 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB         pSrc, pSrcBK, #2                        ;// horizontal helper: 2 bytes left of the block
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_6
        ;// Case f  (dx=2, dy=1): rounded average of dTAcc with narrowed qRes2..qRes5
        M_PRINTF "Case 6 \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// 2 rows above the block
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ;// Saturating rounding shift right by 5, narrowing S16 -> U8. Each
        ;// destination is the low D half of its own source Q register.
        VQRSHRUN    dTResult0, qRes2, #5
        VQRSHRUN    dTResult1, qRes3, #5
        VQRSHRUN    dTResult2, qRes4, #5
        VQRSHRUN    dTResult3, qRes5, #5
        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_7
        ;// Case g  (dx=3, dy=1): as Case_5, with the vertical helper one byte to the right
        M_PRINTF "Case 7 \n"
        ;// NOTE(review): relies on pSrcBK (r8) surviving the first helper call and
        ;// on dAccV0..3 surviving the second - confirm against the helpers.
        MOV         pSrcBK, pSrc
        ADD         pSrc, pSrc, #1                          ;// vertical helper: column x+1
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// and 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB         pSrc, pSrcBK, #2                        ;// horizontal helper: 2 bytes left of the block
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]

        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_8
        ;// Case h  (dx=0, dy=2): vertical helper result stored unmodified
        M_PRINTF "Case 8 \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_9
        ;// Case i  (dx=1, dy=2): rounded average of dTAcc with narrowed qTemp columns 2..5
        M_PRINTF "Case 9 \n"
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// 2 rows above the block
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        ;// Shift each row left by two S16 elements: the low D half becomes
        ;// elements 2,3 of the low half followed by elements 0,1 of the high half.
        VEXT        dTempP0, dTempP0, dTempP1, #2
        VEXT        dTempQ0, dTempQ0, dTempQ1, #2
        VEXT        dTempR0, dTempR0, dTempR1, #2
        VEXT        dTempS0, dTempS0, dTempS1, #2

        ;// Order matters: dTResult2 (d18) and dTResult3 (d20) are the low halves
        ;// of qTempP01 (q9) and qTempQ01 (q10), which are consumed by the first
        ;// two instructions before being overwritten.
        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_a
        ;// Case j  (dx=2, dy=2): diagonal helper result stored unmodified
        M_PRINTF "Case a \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// 2 rows above the block
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_b
        ;// Case k  (dx=3, dy=2): as Case_9, but using qTemp columns 3..6
        M_PRINTF "Case b \n"
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// 2 rows above the block
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        ;// Shift each row left by three S16 elements: the low D half becomes
        ;// element 3 of the low half followed by elements 0..2 of the high half.
        VEXT        dTempP0, dTempP0, dTempP1, #3
        VEXT        dTempQ0, dTempQ0, dTempQ1, #3
        VEXT        dTempR0, dTempR0, dTempR1, #3
        VEXT        dTempS0, dTempS0, dTempS1, #3

        ;// Order matters: dTResult2 (d18) and dTResult3 (d20) are the low halves
        ;// of qTempP01 (q9) and qTempQ01 (q10), which are consumed by the first
        ;// two instructions before being overwritten.
        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_c
        ;// Case n  (dx=0, dy=3): as Case_4, but averaging with dSrc1..dSrc4
        M_PRINTF "Case c \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// helper is entered 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD      dAccV0, dAccV0, dSrc1
        VRHADD      dAccV2, dAccV2, dSrc3
        VRHADD      dAccV1, dAccV1, dSrc2
        VRHADD      dAccV3, dAccV3, dSrc4
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultV0[0], [pDst], dstStep
        VST1        dResultV2[0], [Temp], dstStep
        VST1        dResultV1[0], [pDst]
        VST1        dResultV3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_d
        ;// Case p  (dx=1, dy=3): as Case_5, with the horizontal helper one row lower
        M_PRINTF "Case d \n"

        ;// NOTE(review): relies on pSrcBK (r8) surviving the first helper call and
        ;// on dAccV0..3 surviving the second - confirm against the helpers.
        MOV         pSrcBK, pSrc
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// vertical helper: 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         pSrc, pSrcBK, srcStep                   ;// horizontal helper: row y+1
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_e
        ;// Case q  (dx=2, dy=3): as Case_6, but using qRes3..qRes6
        M_PRINTF "Case e \n"

        SUB         pSrc, pSrc, srcStep, LSL #1             ;// 2 rows above the block
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        ;// Order matters: dTResult1..3 (d16, d18, d20) are the low halves of
        ;// qRes3..qRes5 (q8..q10); each is overwritten only after the Q register
        ;// it belongs to has been consumed by the preceding instruction.
        VQRSHRUN    dTResult0, qRes3, #5
        VQRSHRUN    dTResult1, qRes4, #5
        VQRSHRUN    dTResult2, qRes5, #5
        VQRSHRUN    dTResult3, qRes6, #5

        VRHADD      dTAcc0, dTAcc0, dTResult0
        VRHADD      dTAcc2, dTAcc2, dTResult2
        VRHADD      dTAcc1, dTAcc1, dTResult1
        VRHADD      dTAcc3, dTAcc3, dTResult3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dTRes0[0], [pDst], dstStep
        VST1        dTRes2[0], [Temp], dstStep
        VST1        dTRes1[0], [pDst]
        VST1        dTRes3[0], [Temp]
        M_ADR       pArgs, ppArgs
        B           Block4x4LoopEnd
Case_f
        ;// Case r  (dx=3, dy=3): vertical helper at column x+1, horizontal helper at row y+1
        M_PRINTF "Case f \n"
        ;// NOTE(review): relies on pSrcBK (r8) surviving the first helper call and
        ;// on dAccV0..3 surviving the second - confirm against the helpers.
        MOV         pSrcBK, pSrc
        ADD         pSrc, pSrc, #1                          ;// vertical helper: column x+1
        SUB         pSrc, pSrc, srcStep, LSL #1             ;// and 2 rows above the block
        BL          armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD         pSrc, pSrcBK, srcStep                   ;// horizontal helper: row y+1
        SUB         pSrc, pSrc, #2                          ;// and 2 bytes to the left
        BL          armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD      dAccH0, dAccH0, dAccV0
        VRHADD      dAccH2, dAccH2, dAccV2
        VRHADD      dAccH1, dAccH1, dAccV1
        VRHADD      dAccH3, dAccH3, dAccV3
        ADD         Temp, pDst, dstStep, LSL #1
        VST1        dResultH0[0], [pDst], dstStep
        VST1        dResultH2[0], [Temp], dstStep
        VST1        dResultH1[0], [pDst]
        VST1        dResultH3[0], [Temp]
        M_ADR       pArgs, ppArgs
        ;// Last case: falls through into Block4x4LoopEnd.


Block4x4LoopEnd

        ;// Width Loop
        ;//M_ADR       pArgs, ppArgs
        ;// Restore the values saved by the STM at the loop head, then step 4
        ;// bytes right. The ADDs do not set flags, so BGT tests the SUBS result.
        LDM         pArgs, {pSrc,srcStep,pDst,dstStep}      ;// Load arguments
        SUBS        iWidth, iWidth, #4
        ADD         pSrc, pSrc, #4
        ADD         pDst, pDst, #4
        BGT         Block4x4WidthLoop

        ;// Height Loop
        ;// Here pSrc/pDst sit roi.width bytes past the start of the finished
        ;// block row; move down 4 rows and back to the left edge. Only the SUBS
        ;// sets flags among the instructions written here.
        ;// NOTE(review): BGT also relies on M_LDR / M_ADR leaving the flags
        ;// untouched - confirm in armCOMM_s.h.
        SUBS        iHeight, iHeight, #4
        M_LDR       iWidth, ptrWidth                        ;// reload the full width for the next block row
        M_ADR       pArgs, ppArgs
        ADD         pSrc, pSrc, srcStep, LSL #2
        ADD         pDst, pDst, dstStep, LSL #2
        SUB         pSrc, pSrc, iWidth
        SUB         pDst, pDst, iWidth
        BGT         Block4x4HeightLoop

EndOfInterpolation
        MOV         r0, #0                                  ;// return value 0 (documented above as OMX_Sts_NoErr)
        M_END

    ENDIF
        ;// End of CortexA8

    END
