;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

;//-----------------------------------------------------------------------------
;// Overview
;//   Syntax : ARM armasm (RN/DN/QN register aliases, M_* macros from
;//            armCOMM_s.h), NEON instructions, CortexA8 variant only.
;//   Entry  : omxVCM4P10_PredictIntraChroma_8x8
;//            r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst,
;//            stack = leftStep, dstStep, predMode, availability.
;//   Result : r0 = OMX_Sts_NoErr on every path in this file; an 8x8 block of
;//            bytes is written to pDst with a row pitch of dstStep.
;//   Modes  : selected through armVCM4P10_pIndexTable8x8 (DC, HOR, VERT, PLANE).
;//-----------------------------------------------------------------------------


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_pIndexTable8x8

;// Define the processor variants supported by this file

        M_VARIANTS CortexA8

        AREA table, DATA
;//-------------------------------------------------------
;// This table implements the C switch/case in asm by
;// the method of two levels of indexing: entry [predMode]
;// holds the address of the code label for that mode and
;// is loaded straight into pc by the dispatcher below.
;// The entry order (DC, HOR, VERT, PLANE) presumably
;// mirrors the numeric values of the predMode enumeration
;// -- TODO confirm against the OMX headers.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD  OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR
        DCD  OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE

;//-------------------------------------------------------
;// Halfword constants for the PLANE mode.
;//   First 4 halfwords  [3,2,1,4]  : weights applied to the
;//       edge differences when forming the H and V sums.
;//   Next  8 halfwords  [-3 .. 4]  : per-column / per-row
;//       offsets multiplied by the b and c gradients.
;// The code reads the first row with post-increment and
;// then loads the remaining 16 bytes as one Q register.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW   3, 2, 1,4
        DCW  -3,-2,-1,0
        DCW   1, 2, 3,4



        IF CortexA8

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------

pc              RN 15
return          RN 0            ;// same register as pSrcLeft: only written once pSrcLeft is no longer needed
pTable          RN 8

;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pMultiplierTable    RN 2        ;// reuses r2: loaded only after pSrcAboveLeft has been read (PLANE mode)

pTmp            RN 9            ;// second row pointer (odd rows) for interleaved loads/stores
step            RN 10           ;// 2 * leftStep or 2 * dstStep

;//---------------------
;// Neon Registers
;//---------------------
;// Note: many of the names below are aliases of the same physical D/Q
;// register (for example D0 is dLeftVal0, dAboveVal, dDstRow0, dConst128U8
;// and dSum0). The instruction order in each mode relies on this aliasing.

;// OMX_VC_CHROMA_HOR

dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal4       DN  D4.8
dLeftVal5       DN  D5.8
dLeftVal6       DN  D6.8
dLeftVal7       DN  D7.8

;// OMX_VC_CHROMA_VERT

dAboveVal       DN  D0.U8

;// OMX_VC_CHROMA_DC

dLeftVal        DN  D1.U8
dSumAboveValU16 DN  D2.U16
dSumAboveValU32 DN  D3.U32
dSumAboveValU8  DN  D3.U8
dSumLeftValU16  DN  D2.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU8   DN  D1.U8
dSumAboveLeft   DN  D2.U32
dSumAboveLeftU8 DN  D2.U8
dIndexRow0U8    DN  D5.U8
dIndexRow0      DN  D5.U64
dIndexRow4U8    DN  D6.U8
dIndexRow4      DN  D6.U64
dDstRow0        DN  D0.U8
dDstRow4        DN  D4.U8
dConst128U8     DN  D0.U8

;// OMX_VC_CHROMA_PLANE

dRevAboveVal    DN  D3.U8
dRevAboveValU64 DN  D3.U64
dAboveLeftVal   DN  D2.U8
qAbove7minus0   QN  Q3.S16
qAboveDiff      QN  Q2.S16
dIndex          DN  D8.U8
dDiffAboveU8    DN  D9.U8
dDiffAboveS16   DN  D9.S16
dAboveDiff0U8   DN  D4.U8
dAboveDiff0U64  DN  D4.U64
dAbove7minus0U8 DN  D6.U8
dMultiplier     DN  D10.S16
dHorPred        DN  D11.S16
dRevLeftVal     DN  D3.U8
dRevLeftValU64  DN  D3.U64
qLeft7minus0    QN  Q7.S16
qLeftDiff       QN  Q6.S16
dDiffLeftU8     DN  D16.U8
dDiffLeftS16    DN  D16.S16
dLeftDiff0U8    DN  D12.U8
dLeftDiff0U64   DN  D12.U64
dLeft7minus0U8  DN  D14.U8
dVerPred        DN  D3.S16
dHVValS16       DN  D3.S16
dHVValS32       DN  D3.S32
dHVTempS32      DN  D2.S32
qA              QN  Q0.S16
qB              QN  Q2.S16
qC              QN  Q3.S16
qMultiplier     QN  Q5.S16
dMultiplier0    DN  D10.S16
dMultiplier1    DN  D11.S16
qC0             QN  Q0.S16
qC1             QN  Q1.S16
qC2             QN  Q4.S16
qC3             QN  Q5.S16
qC4             QN  Q6.S16
qC5             QN  Q7.S16
qC6             QN  Q8.S16
qC7             QN  Q9.S16
qSum0           QN  Q0.S16
qSum1           QN  Q1.S16
qSum2           QN  Q4.S16
qSum3           QN  Q5.S16
qSum4           QN  Q6.S16
qSum5           QN  Q7.S16
qSum6           QN  Q8.S16
qSum7           QN  Q9.S16
dSum0           DN  D0.U8
dSum1           DN  D1.U8
dSum2           DN  D2.U8
dSum3           DN  D3.U8
dSum4           DN  D4.U8
dSum5           DN  D5.U8
dSum6           DN  D6.U8
dSum7           DN  D7.U8

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        ;// (M_START is defined in armCOMM_s.h; the "r10, d15" operands presumably
        ;//  name the highest core / NEON registers to preserve -- confirm there.)
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        LDR      pTable,=armVCM4P10_pIndexTable8x8  ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR    predMode, PredMode                 ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                 ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                  ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability         ;// Arg availability loaded from stack to reg

        ;// NOTE(review): predMode is used unchecked as an index into the
        ;// 4-entry table above; a value outside 0..3 loads an arbitrary word
        ;// into pc. Confirm that callers validate predMode.
        LDR      pc, [pTable, predMode, LSL #2]     ;// Branch to the case based on predMode

;//--------------------------------------------------------------------
;// DC mode: each 4x4 quadrant is filled with a rounded mean of the
;// neighbouring edge samples selected by the availability flags.
;//--------------------------------------------------------------------
OMX_VC_CHROMA_DC

        TST     availability, #OMX_VC_LEFT
        BEQ     DCChroma8x8LeftNotAvailable

        ;// pSrcLeft walks even rows, pTmp walks odd rows, both stepping 2*leftStep
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load Left Edge
        VLD1    {dLeftVal[0]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                ;// pSrcLeft[7*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[0 to 7]

        MOV     return, #OMX_Sts_NoErr              ;// returnNoError (pSrcLeft is no longer needed)

        VPADDL  dSumAboveValU16, dAboveVal          ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16    ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]

        VPADDL  dSumLeftValU16, dLeftVal            ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16      ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]

        VADD    dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
        VRSHR   dSumAboveLeft,dSumAboveLeft,#3      ;// Sum = (Sum + 4) >> 3
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2  ;// Sum = (Sum + 2) >> 2
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2    ;// Sum = (Sum + 2) >> 2

        ;// Build VTBL byte indices: 0x00 / 0x04 pick the low byte of the
        ;// first table register's 32-bit lane 0 / 1, 0x0c picks the low byte
        ;// of the second table register's 32-bit lane 1.
        VMOV    dIndexRow0U8,#0x0c
        VMOV    dIndexRow4U8,#0x04
        VSHL    dIndexRow0,dIndexRow0,#32           ;// index0 = 0x0c0c0c0c00000000
        VSHR    dIndexRow4,dIndexRow4,#32           ;// index4 = 0x0000000004040404
        VADD    dIndexRow4U8,dIndexRow4U8,dIndexRow0U8  ;// index4 = 0x0c0c0c0c04040404
        ;// Rows 0-3: x 0..3 = mean(above[0..3], left[0..3]), x 4..7 = mean(above[4..7])
        VTBL    dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
        ;// Rows 4-7: x 0..3 = mean(left[4..7]), x 4..7 = mean(above[4..7], left[4..7])
        VTBL    dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8

        ;// Store rows 0-3 from dDstRow0 and rows 4-7 from dDstRow4
DCChroma8x8LeftStore
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0,[pDst],step                ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pDst],step                ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pTmp],step                ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pDst],step                ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pTmp]                     ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


        ;// Left edge only: rows 0-3 get mean(left[0..3]), rows 4-7 get mean(left[4..7])
DCChroma8x8LeftOnlyAvailable

        MOV     return, #OMX_Sts_NoErr

        VPADDL  dSumLeftValU16, dLeftVal            ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16      ;// pSrcLeft[ 4+5+6+7 | 0+1+2+3 ]
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2    ;// Sum = (Sum + 2) >> 2

        VDUP    dDstRow0,dSumLeftValU8[0]
        VDUP    dDstRow4,dSumLeftValU8[4]

        B       DCChroma8x8LeftStore


        ;// Upper edge only: every row is [mean(above[0..3]) x4 | mean(above[4..7]) x4]
DCChroma8x8LeftNotAvailable

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8NoneAvailable

        ;// Load Upper Edge
        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[0 to 7]
        MOV     return, #OMX_Sts_NoErr              ;// returnNoError

        VPADDL  dSumAboveValU16, dAboveVal          ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16    ;// pSrcAbove[ 4+5+6+7 | 0+1+2+3 ]
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2  ;// Sum = (Sum + 2) >> 2
        VMOV    dIndexRow0U8,#0x04
        VSHL    dIndexRow0,dIndexRow0,#32           ;// index = 0x0404040400000000
        VTBL    dDstRow0,{dSumAboveValU8},dIndexRow0U8

        B       DCChroma8x8UpperStore


        ;// Neither edge available: fill with 0x80.
        ;// dConst128U8 is D0, the same register as dDstRow0, and execution
        ;// falls through into DCChroma8x8UpperStore.
DCChroma8x8NoneAvailable

        VMOV    dConst128U8,#0x80                   ;// 0x8080808080808080 if(count == 0)
        MOV     return, #OMX_Sts_NoErr              ;// returnNoError

        ;// Store the same 8 bytes (dDstRow0 = D0) to all eight rows
DCChroma8x8UpperStore

        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0,[pDst],step                ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp]                     ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


;//--------------------------------------------------------------------
;// VERT mode: the row above is copied to all eight rows.
;// dAboveVal and dDstRow0 are both D0, so the shared store is reused.
;//--------------------------------------------------------------------
OMX_VC_CHROMA_VERT

        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[x] :0<= x <= 7
        MOV     return, #OMX_Sts_NoErr

        B       DCChroma8x8UpperStore


;//--------------------------------------------------------------------
;// HOR mode: each left sample is replicated across its row.
;// dLeftVal0..7 are D0..D7, the same registers as dSum0..7, so the
;// PLANE store sequence is reused (it also sets the return value).
;//--------------------------------------------------------------------
OMX_VC_CHROMA_HOR

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        VLD1    {dLeftVal0[]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal4[]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal5[]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal6[]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal7[]},[pTmp]                ;// pSrcLeft[7*leftStep]

        B       DCChroma8x8PlaneStore


;//--------------------------------------------------------------------
;// PLANE mode. With A[] = pSrcAbove, L[] = pSrcLeft, AL = pSrcAboveLeft[0]:
;//   H = 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + 1*(A[4]-A[2]) + 4*(A[7]-AL)
;//   V = same expression on L[]
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5,  a = 16*(A[7] + L[7])
;//   pDst[y][x] = saturate_u8((a + b*(x-3) + c*(y-3) + 16) >> 5)
;//--------------------------------------------------------------------
OMX_VC_CHROMA_PLANE
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[x] :0<= x <= 7
        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]    ;// last use of r2 as pSrcAboveLeft

        VLD1    {dLeftVal[0]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                ;// pSrcLeft[7*leftStep]


        VREV64  dRevAboveVal,dAboveVal              ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
        VSHR    dRevAboveValU64,dRevAboveValU64,#8  ;// pSrcAbove[X:0:1:2:3:4:5:6]
        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal   ;// pSrcAbove[6] - pSrcAbove[0]
                                                    ;// pSrcAbove[5] - pSrcAbove[1]
                                                    ;// pSrcAbove[4] - pSrcAbove[2]

        VREV64  dRevLeftVal,dLeftVal                ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
        VSHR    dRevLeftValU64,dRevLeftValU64,#8    ;// pSrcLeft[X:0:1:2:3:4:5:6]
        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal      ;// pSrcLeft[6] - pSrcLeft[0]
                                                    ;// pSrcLeft[5] - pSrcLeft[1]
                                                    ;// pSrcLeft[4] - pSrcLeft[2]

        ;// r2 is reused as the table pointer from here on
        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16   ;// make room so VEXT can append the (7 - aboveLeft) term
        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2   ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ], AL = pSrcAboveLeft[0]
        VLD1    dMultiplier,[pMultiplierTable]!     ;// dMultiplier = [ 4|1|2|3 ]; pointer advances to the [-3..4] rows
        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2      ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ], AL = pSrcAboveLeft[0]


        VMUL    dHorPred,dDiffAboveS16,dMultiplier  ;// pSrcAbove[ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VMUL    dVerPred,dDiffLeftS16,dMultiplier   ;// pSrcLeft [ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VPADD   dHVValS16,dHorPred,dVerPred         ;// pairwise partial sums: [ V | V | H | H ]


        VPADDL  dHVValS32,dHVValS16                 ;// [V|H] in 32 bits each
        VSHL    dHVTempS32,dHVValS32,#4             ;// 17*H = 16*H + H = (H<<4)+H
        VADD    dHVValS32,dHVValS32,dHVTempS32      ;// [ 17*V | 17*H ]in 32 bits each
        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]  ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
        VRSHR   dHVValS32,dHVValS32,#5              ;// [c|b] = (17*[V|H] + 16) >> 5, one per 32-bit lane
        VADDL   qA,dAboveVal,dLeftVal               ;// lane 7 = pSrcAbove[7] + pSrcLeft[7]
        VDUP    qA,qA[7]
        VSHL    qA,qA,#4                            ;// [a|a|a|a|a|a|a|a]
        VDUP    qB,dHVValS16[0]                     ;// [b|b|b|b|b|b|b|b] (low halfword of the b lane)
        VDUP    qC,dHVValS16[2]                     ;// [c|c|c|c|c|c|c|c] (low halfword of the c lane)


        VMUL    qB,qB,qMultiplier                   ;// b*(x-3), x = 0..7
        VMUL    qC,qC,qMultiplier                   ;// c*(y-3), y = 0..7
        VADD    qB,qB,qA                            ;// a + b*(x-3): identical for every row

        ;// Broadcast the per-row term c*(y-3) for each of the 8 rows
        VDUP    qC0,qC[0]
        VDUP    qC1,qC[1]
        VDUP    qC2,qC[2]
        VDUP    qC3,qC[3]
        VDUP    qC4,qC[4]
        VDUP    qC5,qC[5]
        VDUP    qC6,qC[6]
        VDUP    qC7,qC[7]

        ;// Row y = a + b*(x-3) + c*(y-3)
        VADD    qSum0,qB,qC0
        VADD    qSum1,qB,qC1
        VADD    qSum2,qB,qC2
        VADD    qSum3,qB,qC3
        VADD    qSum4,qB,qC4
        VADD    qSum5,qB,qC5
        VADD    qSum6,qB,qC6
        VADD    qSum7,qB,qC7

        VQRSHRUN dSum0,qSum0,#5                     ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1,qSum1,#5
        VQRSHRUN dSum2,qSum2,#5
        VQRSHRUN dSum3,qSum3,#5
        VQRSHRUN dSum4,qSum4,#5
        VQRSHRUN dSum5,qSum5,#5
        VQRSHRUN dSum6,qSum6,#5
        VQRSHRUN dSum7,qSum7,#5

        ;// Store D0..D7 as rows 0..7 (shared by the HOR and PLANE modes)
DCChroma8x8PlaneStore
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dSum0,[pDst],step                   ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dSum1,[pTmp],step                   ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dSum2,[pDst],step                   ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dSum3,[pTmp],step                   ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dSum4,[pDst],step                   ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dSum5,[pTmp],step                   ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dSum6,[pDst],step                   ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dSum7,[pTmp]                        ;// pDst[7*dstStep+x] :0<= x <= 7

        MOV     return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------