;//
;//
;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Define the processor variants supported by this file

        M_VARIANTS CortexA8

;//-------------------------------------------------------
;// This table implements the C switch-case in assembly
;// using two levels of indexing.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pSwitchTable4x4
        DCD  OMX_VC_4x4_VERT,    OMX_VC_4x4_HOR
        DCD  OMX_VC_4x4_DC,      OMX_VC_4x4_DIAG_DL
        DCD  OMX_VC_4x4_DIAG_DR, OMX_VC_4x4_VR
        DCD  OMX_VC_4x4_HD,      OMX_VC_4x4_VL
        DCD  OMX_VC_4x4_HU


        IF CortexA8

;//--------------------------------------------
;// Scratch variables
;//--------------------------------------------
return          RN 0
pTable          RN 8
pc              RN 15

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pDst1           RN 1
pDst2           RN 4
pDst3           RN 6

pSrcTmp         RN 9
srcStep         RN 10
pDstTmp         RN 11
dstep           RN 12

;//-------------------
;// Neon registers
;//-------------------

;// OMX_VC_4x4_VERT
dAboveU32       DN D0.U32

;// OMX_VC_4x4_HOR
dLeftVal0       DN D0.8
dLeftVal1       DN D1.8
dLeftVal2       DN D2.8
dLeftVal3       DN D3.8
dLeftVal0U32    DN D0.U32
dLeftVal1U32    DN D1.U32
dLeftVal2U32    DN D2.U32
dLeftVal3U32    DN D3.U32

;// OMX_VC_4x4_DC
dLeftVal          DN D0.U8
dLeftValU32       DN D0.U32
dSumAboveLeftU16  DN D1.U16
dSumAboveLeftU32  DN D1.U32
dSumAboveLeftU64  DN D1.U64
dSumAboveLeftU8   DN D1.U8
dSum              DN D0.U8

dSumLeftValU16  DN D1.U16
dSumLeftValU32  DN D1.U32
dSumLeftValU64  DN D1.U64
dSumLeftValU8   DN D1.U8

dAboveVal       DN D0.U8
dSumAboveValU16 DN D1.U16
dSumAboveValU32 DN D1.U32
dSumAboveValU64 DN D1.U64
dSumAboveValU8  DN D1.U8
dConst128U8     DN D0.U8

;// OMX_VC_4x4_DIAG_DL
dAbove          DN D0.U8
dU7             DN D2.U8
dU3             DN D2.U8
dAbove0         DN D3.U8
dAbove1         DN D4.U8
dAbove2         DN D5.U8
dTmp            DN D6.U8
dTmp0           DN D7.U8
dTmp1           DN D8.U8
dTmp2           DN D9.U8
dTmp3           DN D10.U8
dTmpU32         DN D6.U32

;// OMX_VC_4x4_DIAG_DR
dLeft           DN D1.U8
dUL             DN D2.U8

;// OMX_VC_4x4_VR
dLeft0          DN D1.U8
dLeft1          DN D2.U8
dEven0          DN D3.U8
dEven1          DN D4.U8
dEven2          DN D5.U8
dOdd0           DN D6.U8
dOdd1           DN D11.U8
dOdd2           DN D12.U8
dTmp3U32        DN D10.U32
dTmp2U32        DN D9.U32

;// OMX_VC_4x4_HD
dTmp1U64        DN D8.U64
dTmp0U64        DN D7.U64
dTmpU64         DN D6.U64
dTmpU32         DN D6.U32
dTmp1U32        DN D8.U32

;// OMX_VC_4x4_HU
dL3             DN D2.U8
dLeftHU0        DN D3.U8
dLeftHU1        DN D4.U8
dLeftHU2        DN D5.U8
dTmp0U32        DN D7.U32


;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 starts
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_4x4, r12, d12

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case

        ;// Load arguments from the stack
        M_LDRD   predMode,availability,PredMode      ;// Arg predMode & availability loaded from stack to reg
        M_LDRD   leftStep,dstStep,LeftStep           ;// Arg leftStep & dstStep loaded from stack to reg

        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

OMX_VC_4x4_HOR

        ADD      pSrcTmp, pSrcLeft, leftStep
        ADD      srcStep, leftStep, leftStep
        ;// Load Left Edge
        VLD1     {dLeftVal0[]},[pSrcLeft],srcStep    ;// pSrcLeft[0*leftStep]
        VLD1     {dLeftVal1[]},[pSrcTmp],srcStep     ;// pSrcLeft[1*leftStep]
        VLD1     {dLeftVal2[]},[pSrcLeft]            ;// pSrcLeft[2*leftStep]
        VLD1     {dLeftVal3[]},[pSrcTmp]             ;// pSrcLeft[3*leftStep]

        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep

        VST1     dLeftVal0U32[0],[pDst],dstep        ;// pDst[0*dstStep+x] : 0<= x <= 3
        VST1     dLeftVal1U32[0],[pDstTmp],dstep     ;// pDst[1*dstStep+x] : 0<= x <= 3
        VST1     dLeftVal2U32[0],[pDst]              ;// pDst[2*dstStep+x] : 0<= x <= 3
        VST1     dLeftVal3U32[0],[pDstTmp]           ;// pDst[3*dstStep+x] : 0<= x <= 3

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_VERT

        ;// Load Upper Edge
        VLD1     dAboveU32[0],[pSrcAbove]
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep

DCPredict4x4VertStore

        VST1     dAboveU32[0],[pDst],dstep
        VST1     dAboveU32[0],[pDstTmp],dstep
        VST1     dAboveU32[0],[pDst]
        VST1     dAboveU32[0],[pDstTmp]

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_DC

        TST      availability, #OMX_VC_LEFT
        BEQ      DCPredict4x4LeftNotAvailable

        ADD      pSrcTmp, pSrcLeft, leftStep
        ADD      srcStep, leftStep, leftStep
        ;// Load Left Edge
        VLD1     {dLeftVal[0]},[pSrcLeft],srcStep    ;// pSrcLeft[0*leftStep]
        VLD1     {dLeftVal[1]},[pSrcTmp],srcStep     ;// pSrcLeft[1*leftStep]
        VLD1     {dLeftVal[2]},[pSrcLeft]            ;// pSrcLeft[2*leftStep]
        VLD1     {dLeftVal[3]},[pSrcTmp]             ;// pSrcLeft[3*leftStep]

        TST      availability, #OMX_VC_UPPER
        BEQ      DCPredict4x4LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1     dLeftValU32[1],[pSrcAbove]          ;// pSrcAbove[0 to 3]
        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumAboveLeftU16, dLeftVal          ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16  ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32  ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3 ;// Sum = (Sum + 4) >> 3
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveLeftU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr              ;// returnNoError

        VPADDL   dSumLeftValU16, dLeftVal            ;// [ XX | pSrcLeft[2+3 | 0+1]]
        VPADDL   dSumLeftValU32, dSumLeftValU16      ;// [ XXXX | pSrcLeft[2+3+0+1]]

        VRSHR    dSumLeftValU32,dSumLeftValU32,#2    ;// Sum = (Sum + 2) >> 2
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        VDUP     dSum,dSumLeftValU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4LeftNotAvailable

        TST      availability, #OMX_VC_UPPER
        BEQ      DCPredict4x4NoneAvailable

        ;// Load Upper Edge
        VLD1     dAboveU32[0],[pSrcAbove]            ;// pSrcAbove[0 to 3]
        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumAboveValU16, dAboveVal          ;// [ XX | pSrcAbove[2+3 | 0+1]]
        VPADDL   dSumAboveValU32, dSumAboveValU16    ;// [ XXXX | pSrcAbove[2+3+0+1]]

        VRSHR    dSumAboveValU32,dSumAboveValU32,#2  ;// Sum = (Sum + 2) >> 2
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveValU8[0]

        B        DCPredict4x4VertStore

DCPredict4x4NoneAvailable

        VMOV     dConst128U8,#0x80                   ;// 0x8080808080808080 if(count == 0)
        MOV      return, #OMX_Sts_NoErr

        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        B        DCPredict4x4VertStore

OMX_VC_4x4_DIAG_DL

        TST      availability, #OMX_VC_UPPER_RIGHT
        BEQ      DiagDLUpperRightNotAvailable

        VLD1     dAbove0,[pSrcAbove]                 ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VDUP     dU7, dAbove0[7]                     ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT     dAbove1, dAbove0, dU7, #1           ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        VEXT     dAbove2, dAbove0, dU7, #2           ;// [U7|U7|U7|U6|U5|U4|U3|U2]
        B        DiagDLPredict4x4Store

DiagDLUpperRightNotAvailable
        VLD1     dAboveU32[1],[pSrcAbove]            ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP     dU3, dAbove[7]                      ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT     dAbove0, dAbove, dU3, #4            ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT     dAbove1, dAbove, dU3, #5            ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT     dAbove2, dAbove, dU3, #6            ;// [U3 U3 U3 U3 U3 U3 U3 U2]

DiagDLPredict4x4Store

        ;// (a+2*b+c+2)>>2 is computed as VRHADD(VHADD(a,c), b):
        ;// VHADD gives (a+c)>>1, VRHADD then adds b with rounding.
        VHADD    dTmp, dAbove0, dAbove2
        VRHADD   dTmp, dTmp, dAbove1                 ;// (a+2*b+c+2)>>2

        VST1     dTmpU32[0],[pDst],dstStep
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst],dstStep
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst],dstStep
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst]

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_DIAG_DR

        ;// Load U0,U1,U2,U3
        VLD1     dAboveU32[0],[pSrcAbove]            ;// [X|X|X|X|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                      ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1     {dLeft[7]},[pSrcAboveLeft]
        ADD      pSrcTmp, pSrcLeft, leftStep
        ADD      srcStep, leftStep, leftStep
        ADD      pDst1,pDst,dstStep

        VLD1     {dLeft[6]},[pSrcLeft],srcStep       ;// pSrcLeft[0*leftStep]
        VLD1     {dLeft[5]},[pSrcTmp],srcStep        ;// pSrcLeft[1*leftStep]
        VLD1     {dLeft[4]},[pSrcLeft]               ;// pSrcLeft[2*leftStep]
        VLD1     {dLeft[3]},[pSrcTmp]                ;// pSrcLeft[3*leftStep]

        VEXT     dAbove0,dLeft,dAbove,#3             ;// [U2|U1|U0|UL|L0|L1|L2|L3]
        ADD      pDst2,pDst1,dstStep
        VEXT     dAbove1,dLeft,dAbove,#4             ;// [U3|U2|U1|U0|UL|L0|L1|L2]
        ADD      pDst3,pDst2,dstStep
        VEXT     dAbove2,dLeft,dAbove,#5             ;// [ X|U3|U2|U1|U0|UL|L0|L1]

        VHADD    dTmp, dAbove0, dAbove2
        VRHADD   dTmp, dTmp, dAbove1                 ;// (a+2*b+c+2)>>2

        VST1     dTmpU32[0],[pDst3]                  ;// Store pTmp[0],[1],[2],[3] @ pDst3
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst2]                  ;// Store pTmp[1],[2],[3],[4] @ pDst2
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst1]                  ;// Store pTmp[2],[3],[4],[5] @ pDst1
        VEXT     dTmp,dTmp,dTmp,#1
        VST1     dTmpU32[0],[pDst]                   ;// Store pTmp[3],[4],[5],[6] @ pDst

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_VR

        ;// Load UL,U0,U1,U2,U3
        VLD1     dAboveU32[0],[pSrcAbove]
        VLD1     dAbove[7],[pSrcAboveLeft]           ;// [UL|X|X|X|U3|U2|U1|U0]

        ;// Load L0,L1,L2                            ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
                                                     ;// dLeft1 = [L1| X|X|X|X|X|X|X]
        VLD1     {dLeft0[7]},[pSrcLeft],leftStep     ;// pSrcLeft[0*leftStep]
        VLD1     {dLeft1[7]},[pSrcLeft],leftStep     ;// pSrcLeft[1*leftStep]
        VLD1     {dLeft0[6]},[pSrcLeft]              ;// pSrcLeft[2*leftStep]

        VEXT     dOdd2,dAbove,dAbove,#7              ;// [ x x x U3 U2 U1 U0 UL ]
        VEXT     dEven0,dLeft0,dOdd2,#6              ;// [ x x x U1 U0 UL L0 L2 ]
        VEXT     dEven1,dLeft1,dOdd2,#7              ;// [ x x x U2 U1 U0 UL L1 ]
        VEXT     dEven2,dLeft0,dAbove,#7             ;// [ x x x U3 U2 U1 U0 L0 ]
        VEXT     dOdd0,dLeft1,dAbove,#7              ;// [ x x x U3 U2 U1 U0 L1 ]
        VEXT     dOdd1,dLeft0,dOdd2,#7               ;// [ x x x U2 U1 U0 UL L0 ]

        VHADD    dTmp1, dOdd0, dOdd2
        VRHADD   dTmp1, dTmp1, dOdd1                 ;// Tmp[ x x x 9 7 5 3 1 ]

        VHADD    dTmp0, dEven0, dEven2
        VRHADD   dTmp0, dTmp0, dEven1                ;// Tmp[ x x x 8 6 4 2 0 ]

        VEXT     dTmp3,dTmp1,dTmp1,#1                ;// Tmp[ x x x x 9 7 5 3 ]
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        VEXT     dTmp2,dTmp0,dTmp0,#1                ;// Tmp[ x x x x 8 6 4 2 ]

        VST1     dTmp3U32[0],[pDst],dstep            ;// Tmp[9],[7],[5],[3]
        VST1     dTmp2U32[0],[pDstTmp],dstep         ;// Tmp[8],[6],[4],[2]
        VST1     dTmp1U32[0],[pDst],dstep            ;// Tmp[7],[5],[3],[1]
        VST1     dTmp0U32[0],[pDstTmp]               ;// Tmp[6],[4],[2],[0]

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_HD

        ;// Load U0,U1,U2,U3
        VLD1     dAbove,[pSrcAbove]                  ;// dAbove = [U7|U6|U5|U4|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                      ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1     {dLeft[7]},[pSrcAboveLeft]
        ADD      pSrcTmp, pSrcLeft, leftStep
        ADD      srcStep, leftStep, leftStep

        VLD1     {dLeft[6]},[pSrcLeft],srcStep       ;// pSrcLeft[0*leftStep]
        VLD1     {dLeft[5]},[pSrcTmp],srcStep        ;// pSrcLeft[1*leftStep]
        VLD1     {dLeft[4]},[pSrcLeft]               ;// pSrcLeft[2*leftStep]
        VLD1     {dLeft[3]},[pSrcTmp]                ;// pSrcLeft[3*leftStep]

        VEXT     dAbove0,dLeft,dAbove,#3             ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
        VEXT     dAbove1,dLeft,dAbove,#2             ;// [ U1|U0|UL|L0|L1|L2|L3| X ]
        VEXT     dAbove2,dLeft,dAbove,#1             ;// [ U0|UL|L0|L1|L2|L3| X| X ]

        VHADD    dTmp0, dAbove0, dAbove2
        VRHADD   dTmp0, dTmp0, dAbove1               ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]

        VRHADD   dTmp1, dAbove1, dAbove0             ;// (a+b+1)>>1
        VSHL     dTmp1U64,dTmp1U64,#24               ;// Tmp[ 3 | 5 | 7 | 9 | X | X | X | X ]

        VSHL     dTmpU64,dTmp0U64,#16                ;// Tmp[ 2 | 4 | 6 | 8 | X | X | X | X ]
        VZIP     dTmp1,dTmp                          ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
        VEXT     dTmp0,dTmp0,dTmp0,#6                ;// Tmp[ X | X | X | X | X | X | 0 | 1 ]
        VEXT     dTmp1,dTmp,dTmp0,#2                 ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 ]

        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep

        VST1     dTmp1U32[1],[pDst],dstep            ;// Store pTmp[0|1|2|3]
        VST1     dTmpU32[1],[pDstTmp],dstep          ;// Store pTmp[2|3|4|5]
        VST1     dTmp1U32[0],[pDst]                  ;// Store pTmp[4|5|6|7]
        VST1     dTmpU32[0],[pDstTmp]                ;// Store pTmp[6|7|8|9]

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_VL

        TST      availability, #OMX_VC_UPPER_RIGHT
        BEQ      DiagVLUpperRightNotAvailable

        VLD1     dAbove0,[pSrcAbove]                 ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VEXT     dAbove1,dAbove0,dAbove0,#1          ;// [ X|U7|U6|U5|U4|U3|U2|U1]
        VEXT     dAbove2,dAbove1,dAbove1,#1          ;// [ X| X|U7|U6|U5|U4|U3|U2]

        B        DiagVLPredict4x4Store

DiagVLUpperRightNotAvailable
        VLD1     dAboveU32[1],[pSrcAbove]            ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP     dU3, dAbove[7]                      ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT     dAbove0, dAbove, dU3, #4            ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT     dAbove1, dAbove, dU3, #5            ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT     dAbove2, dAbove, dU3, #6            ;// [U3 U3 U3 U3 U3 U3 U3 U2]

DiagVLPredict4x4Store

        VRHADD   dTmp0, dAbove1, dAbove0             ;// (a+b+1)>>1
                                                     ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]

        VHADD    dTmp3, dAbove0, dAbove2
        VRHADD   dTmp3, dTmp3, dAbove1               ;// (a+2*b+c+2)>>2
                                                     ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]

        VEXT     dTmp1,dTmp0,dTmp0,#1                ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
        ADD      pDstTmp, pDst, dstStep
        ADD      dstep, dstStep, dstStep
        VEXT     dTmp2,dTmp3,dTmp1,#1                ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]

        VST1     dTmp0U32[0],[pDst],dstep            ;// Tmp[6],[4],[2],[0]
        VST1     dTmp3U32[0],[pDstTmp],dstep         ;// Tmp[7],[5],[3],[1]
        VST1     dTmp1U32[0],[pDst]                  ;// Tmp[8],[6],[4],[2]
        VST1     dTmp2U32[0],[pDstTmp]               ;// Tmp[9],[7],[5],[3]

        B        ExitPredict4x4                      ;// Branch to exit code

OMX_VC_4x4_HU
        ADD      pSrcTmp, pSrcLeft, leftStep
        ADD      srcStep, leftStep, leftStep

        ;// Load Left Edge                           ;// [L3|L2|L1|L0|X|X|X|X]
        VLD1     {dLeft[4]},[pSrcLeft],srcStep       ;// pSrcLeft[0*leftStep]
        VLD1     {dLeft[5]},[pSrcTmp],srcStep        ;// pSrcLeft[1*leftStep]
        VLD1     {dLeft[6]},[pSrcLeft]               ;// pSrcLeft[2*leftStep]
        VLD1     {dLeft[7]},[pSrcTmp]                ;// pSrcLeft[3*leftStep]

        VDUP     dL3,dLeft[7]                        ;// [L3|L3|L3|L3|L3|L3|L3|L3]

        VEXT     dLeftHU0,dLeft,dL3,#4               ;// [L3|L3|L3|L3|L3|L2|L1|L0]
        VEXT     dLeftHU1,dLeft,dL3,#5               ;// [L3|L3|L3|L3|L3|L3|L2|L1]
        VEXT     dLeftHU2,dLeft,dL3,#6               ;// [L3|L3|L3|L3|L3|L3|L3|L2]

        VHADD    dTmp0, dLeftHU0, dLeftHU2
        VRHADD   dTmp0, dTmp0, dLeftHU1              ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]

        VRHADD   dTmp1, dLeftHU1, dLeftHU0           ;// (a+b+1)>>1
                                                     ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]

        VZIP     dTmp1,dTmp0                         ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
                                                     ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]

        VST1     dTmp1U32[0],[pDst],dstStep          ;// [3|2|1|0]
        VEXT     dTmp1,dTmp1,dTmp1,#2
        VST1     dTmp1U32[0],[pDst],dstStep          ;// [5|4|3|2]
        VEXT     dTmp1,dTmp1,dTmp1,#2
        VST1     dTmp1U32[0],[pDst],dstStep          ;// [7|6|5|4]
        VST1     dTmp0U32[0],[pDst]                  ;// [9|8|7|6]

ExitPredict4x4

        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 ends
;//-----------------------------------------------------------------------------------------------