;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_pIndexTable8x8

;// Define the processor variants supported by this file

        M_VARIANTS CortexA8

        AREA table, DATA
;//-------------------------------------------------------
;// This table implements the C "switch (predMode)" in asm
;// by the method of two levels of indexing: predMode
;// selects a word in the table, and that word is the
;// address of the code label handling the mode.
;// Entry order: 0 = DC, 1 = HOR, 2 = VERT, 3 = PLANE.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
        DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE

;//-------------------------------------------------------
;// Multipliers used by the plane predictor.
;// First 4 halfwords : weights for the H/V gradient sums.
;// Last  8 halfwords : (x-3) / (y-3) position factors.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW   3, 2, 1,4
        DCW  -3,-2,-1,0
        DCW   1, 2, 3,4



        IF CortexA8

;//--------------------------------------------
;// Constants
;//--------------------------------------------

;// Highest index that armVCM4P10_pIndexTable8x8 can serve (4 entries).
PRED_MODE_CHROMA_LAST   EQU 3

;// Fallback only: used if omxtypes_s.h did not already define the symbol.
;// NOTE(review): -5 is the OMXResult value of OMX_Sts_BadArgErr in the
;// OpenMAX DL headers - confirm against omxtypes_s.h.
        IF :LNOT: :DEF: OMX_Sts_BadArgErr
OMX_Sts_BadArgErr       EQU -5
        ENDIF

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------

pc              RN 15
return          RN 0
pTable          RN 8

;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable (loaded from stack)
dstStep         RN 5    ;// input variable (loaded from stack)
predMode        RN 6    ;// input variable (loaded from stack)
availability    RN 7    ;// input variable (loaded from stack)
pMultiplierTable    RN 2    ;// reuses r2 once pSrcAboveLeft has been read

pTmp            RN 9    ;// second row pointer (odd rows)
step            RN 10   ;// 2 * row stride

;//---------------------
;// Neon Registers
;// NOTE: many aliases below name the same D/Q register;
;// the instruction order in the function relies on that.
;//---------------------

;// OMX_VC_CHROMA_HOR

dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal4       DN  D4.8
dLeftVal5       DN  D5.8
dLeftVal6       DN  D6.8
dLeftVal7       DN  D7.8

;// OMX_VC_CHROMA_VERT

dAboveVal       DN  D0.U8

;// OMX_VC_CHROMA_DC

dLeftVal        DN  D1.U8
dSumAboveValU16 DN  D2.U16
dSumAboveValU32 DN  D3.U32
dSumAboveValU8  DN  D3.U8
dSumLeftValU16  DN  D2.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU8   DN  D1.U8
dSumAboveLeft   DN  D2.U32
dSumAboveLeftU8 DN  D2.U8
dIndexRow0U8    DN  D5.U8
dIndexRow0      DN  D5.U64
dIndexRow4U8    DN  D6.U8
dIndexRow4      DN  D6.U64
dDstRow0        DN  D0.U8
dDstRow4        DN  D4.U8
dConst128U8     DN  D0.U8

;// OMX_VC_CHROMA_PLANE

dRevAboveVal    DN  D3.U8
dRevAboveValU64 DN  D3.U64
dAboveLeftVal   DN  D2.U8
qAbove7minus0   QN  Q3.S16
qAboveDiff      QN  Q2.S16
dIndex          DN  D8.U8
dDiffAboveU8    DN  D9.U8
dDiffAboveS16   DN  D9.S16
dAboveDiff0U8   DN  D4.U8
dAboveDiff0U64  DN  D4.U64
dAbove7minus0U8 DN  D6.U8
dMultiplier     DN  D10.S16
dHorPred        DN  D11.S16
dRevLeftVal     DN  D3.U8
dRevLeftValU64  DN  D3.U64
qLeft7minus0    QN  Q7.S16
qLeftDiff       QN  Q6.S16
dDiffLeftU8     DN  D16.U8
dDiffLeftS16    DN  D16.S16
dLeftDiff0U8    DN  D12.U8
dLeftDiff0U64   DN  D12.U64
dLeft7minus0U8  DN  D14.U8
dVerPred        DN  D3.S16
dHVValS16       DN  D3.S16
dHVValS32       DN  D3.S32
dHVTempS32      DN  D2.S32
qA              QN  Q0.S16
qB              QN  Q2.S16
qC              QN  Q3.S16
qMultiplier     QN  Q5.S16
dMultiplier0    DN  D10.S16
dMultiplier1    DN  D11.S16
qC0             QN  Q0.S16
qC1             QN  Q1.S16
qC2             QN  Q4.S16
qC3             QN  Q5.S16
qC4             QN  Q6.S16
qC5             QN  Q7.S16
qC6             QN  Q8.S16
qC7             QN  Q9.S16
qSum0           QN  Q0.S16
qSum1           QN  Q1.S16
qSum2           QN  Q4.S16
qSum3           QN  Q5.S16
qSum4           QN  Q6.S16
qSum5           QN  Q7.S16
qSum6           QN  Q8.S16
qSum7           QN  Q9.S16
dSum0           DN  D0.U8
dSum1           DN  D1.U8
dSum2           DN  D2.U8
dSum3           DN  D3.U8
dSum4           DN  D4.U8
dSum5           DN  D5.U8
dSum6           DN  D6.U8
dSum7           DN  D7.U8

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//
;// Writes an 8x8 block of predicted chroma samples to pDst (8 rows of 8 bytes,
;// rows dstStep bytes apart), using the prediction selected by predMode.
;//
;// In   : r0      = pSrcLeft       left edge, one byte every leftStep bytes (8 samples)
;//        r1      = pSrcAbove      upper edge, 8 contiguous bytes
;//        r2      = pSrcAboveLeft  corner sample (read only by the PLANE mode)
;//        r3      = pDst           destination block
;//        [stack] = leftStep, dstStep, predMode, availability (in that order)
;// Out  : r0 = OMX_Sts_NoErr on success,
;//             OMX_Sts_BadArgErr when predMode > PRED_MODE_CHROMA_LAST (unsigned).
;// Notes: Only the DC mode inspects 'availability' (OMX_VC_LEFT / OMX_VC_UPPER);
;//        the HOR, VERT and PLANE paths read their edges unconditionally.
;//        Pointers, steps and 'availability' are not validated here.
;//        Register save/restore is delegated to M_START / M_EXIT / M_END
;//        (armCOMM_s.h); the "r10, d15" arguments presumably name the highest
;//        callee-saved ARM / NEON registers used - confirm against the macro.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        LDR      pTable,=armVCM4P10_pIndexTable8x8  ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR    predMode, PredMode                 ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                 ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                  ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability         ;// Arg availability loaded from stack to reg

        ;// The dispatch table has exactly 4 entries. Reject anything else
        ;// before using predMode as an index, otherwise an arbitrary word
        ;// would be loaded into pc. Unsigned compare also rejects negatives.
        CMP      predMode, #PRED_MODE_CHROMA_LAST
        BHI      PredictIntraChroma8x8BadPredMode

        LDR      pc, [pTable, predMode, LSL #2]     ;// Branch to the case based on predMode

PredictIntraChroma8x8BadPredMode

        LDR      return, =OMX_Sts_BadArgErr         ;// predMode outside the table
        M_EXIT

;//--------------------------------------------
;// DC prediction
;//--------------------------------------------
OMX_VC_CHROMA_DC

        TST      availability, #OMX_VC_LEFT
        BEQ      DCChroma8x8LeftNotAvailable

        ADD      pTmp, pSrcLeft, leftStep           ;// pTmp walks the odd rows
        ADD      step, leftStep, leftStep           ;// both pointers advance 2 rows

        ;// Load Left Edge
        VLD1     {dLeftVal[0]},[pSrcLeft],step      ;// pSrcLeft[0*leftStep]
        VLD1     {dLeftVal[1]},[pTmp],step          ;// pSrcLeft[1*leftStep]
        VLD1     {dLeftVal[2]},[pSrcLeft],step      ;// pSrcLeft[2*leftStep]
        VLD1     {dLeftVal[3]},[pTmp],step          ;// pSrcLeft[3*leftStep]
        VLD1     {dLeftVal[4]},[pSrcLeft],step      ;// pSrcLeft[4*leftStep]
        VLD1     {dLeftVal[5]},[pTmp],step          ;// pSrcLeft[5*leftStep]
        VLD1     {dLeftVal[6]},[pSrcLeft],step      ;// pSrcLeft[6*leftStep]
        VLD1     {dLeftVal[7]},[pTmp]               ;// pSrcLeft[7*leftStep]

        TST      availability, #OMX_VC_UPPER
        BEQ      DCChroma8x8LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1     dAboveVal,[pSrcAbove]              ;// pSrcAbove[0 to 7]

        MOV      return, #OMX_Sts_NoErr             ;// returnNoError (r0/pSrcLeft no longer needed)

        VPADDL   dSumAboveValU16, dAboveVal         ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumAboveValU32, dSumAboveValU16   ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]

        VPADDL   dSumLeftValU16, dLeftVal           ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumLeftValU32, dSumLeftValU16     ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]

        VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
        VRSHR    dSumAboveLeft,dSumAboveLeft,#3     ;// Sum = (Sum + 4) >> 3
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2   ;// Sum = (Sum + 2) >> 2

        ;// Build byte-select indices for VTBL so each half-row picks the
        ;// DC value belonging to its 4x4 quadrant.
        VMOV     dIndexRow0U8,#0x0c
        VMOV     dIndexRow4U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32          ;// index0 = 0x0c0c0c0c00000000
        VSHR     dIndexRow4,dIndexRow4,#32          ;// index4 = 0x0000000004040404
        VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8 ;// index4 = 0x0c0c0c0c04040404
        VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8 ;// rows 0-3: [above+left | above]
        VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8  ;// rows 4-7: [left | above+left]

DCChroma8x8LeftStore
        ADD      pTmp, pDst, dstStep
        ADD      step, dstStep, dstStep

        VST1     dDstRow0,[pDst],step               ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step               ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step               ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step               ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pDst],step               ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pTmp],step               ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pDst],step               ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     dDstRow4,[pTmp]                    ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


DCChroma8x8LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumLeftValU16, dLeftVal           ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumLeftValU32, dSumLeftValU16     ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2   ;// Sum = (Sum + 2) >> 2

        VDUP     dDstRow0,dSumLeftValU8[0]          ;// rows 0-3 use the left rows 0-3 average
        VDUP     dDstRow4,dSumLeftValU8[4]          ;// rows 4-7 use the left rows 4-7 average

        B        DCChroma8x8LeftStore


DCChroma8x8LeftNotAvailable

        TST      availability, #OMX_VC_UPPER
        BEQ      DCChroma8x8NoneAvailable

        ;// Load Upper Edge
        VLD1     dAboveVal,[pSrcAbove]              ;// pSrcAbove[0 to 7]
        MOV      return, #OMX_Sts_NoErr             ;// returnNoError

        VPADDL   dSumAboveValU16, dAboveVal         ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL   dSumAboveValU32, dSumAboveValU16   ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2 ;// Sum = (Sum + 2) >> 2
        VMOV     dIndexRow0U8,#0x04
        VSHL     dIndexRow0,dIndexRow0,#32          ;// index = 0x0404040400000000
        VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8 ;// [above 4-7 avg | above 0-3 avg]

        B        DCChroma8x8UpperStore


DCChroma8x8NoneAvailable

        VMOV     dConst128U8,#0x80                  ;// 0x8080808080808080 if(count == 0)
        MOV      return, #OMX_Sts_NoErr             ;// returnNoError

        ;// Shared store: writes the same 8 bytes (D0) to all 8 rows.
DCChroma8x8UpperStore

        ADD      pTmp, pDst, dstStep
        ADD      step, dstStep, dstStep

        VST1     dDstRow0,[pDst],step               ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step               ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step               ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step               ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step               ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp],step               ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pDst],step               ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     dDstRow0,[pTmp]                    ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT

;//--------------------------------------------
;// Vertical prediction: every row is a copy of the upper edge
;//--------------------------------------------
OMX_VC_CHROMA_VERT

        VLD1     dAboveVal,[pSrcAbove]              ;// pSrcAbove[x]      :0<= x <= 7
        MOV      return, #OMX_Sts_NoErr

        B        DCChroma8x8UpperStore

;//--------------------------------------------
;// Horizontal prediction: row y is filled with pSrcLeft[y*leftStep]
;//--------------------------------------------
OMX_VC_CHROMA_HOR

        ADD      pTmp, pSrcLeft, leftStep
        ADD      step, leftStep, leftStep

        ;// Each load replicates one left sample into all 8 lanes of D0..D7,
        ;// which are the registers the plane store writes out.
        VLD1     {dLeftVal0[]},[pSrcLeft],step      ;// pSrcLeft[0*leftStep]
        VLD1     {dLeftVal1[]},[pTmp],step          ;// pSrcLeft[1*leftStep]
        VLD1     {dLeftVal2[]},[pSrcLeft],step      ;// pSrcLeft[2*leftStep]
        VLD1     {dLeftVal3[]},[pTmp],step          ;// pSrcLeft[3*leftStep]
        VLD1     {dLeftVal4[]},[pSrcLeft],step      ;// pSrcLeft[4*leftStep]
        VLD1     {dLeftVal5[]},[pTmp],step          ;// pSrcLeft[5*leftStep]
        VLD1     {dLeftVal6[]},[pSrcLeft],step      ;// pSrcLeft[6*leftStep]
        VLD1     {dLeftVal7[]},[pTmp]               ;// pSrcLeft[7*leftStep]

        B        DCChroma8x8PlaneStore

;//--------------------------------------------
;// Plane prediction
;// (AL below stands for pSrcAboveLeft[0])
;//--------------------------------------------
OMX_VC_CHROMA_PLANE
        ADD      pTmp, pSrcLeft, leftStep
        ADD      step, leftStep, leftStep

        VLD1     dAboveVal,[pSrcAbove]              ;// pSrcAbove[x]      :0<= x <= 7
        VLD1     dAboveLeftVal[0],[pSrcAboveLeft]   ;// only lane 0 is loaded and used

        VLD1     {dLeftVal[0]},[pSrcLeft],step      ;// pSrcLeft[0*leftStep]
        VLD1     {dLeftVal[1]},[pTmp],step          ;// pSrcLeft[1*leftStep]
        VLD1     {dLeftVal[2]},[pSrcLeft],step      ;// pSrcLeft[2*leftStep]
        VLD1     {dLeftVal[3]},[pTmp],step          ;// pSrcLeft[3*leftStep]
        VLD1     {dLeftVal[4]},[pSrcLeft],step      ;// pSrcLeft[4*leftStep]
        VLD1     {dLeftVal[5]},[pTmp],step          ;// pSrcLeft[5*leftStep]
        VLD1     {dLeftVal[6]},[pSrcLeft],step      ;// pSrcLeft[6*leftStep]
        VLD1     {dLeftVal[7]},[pTmp]               ;// pSrcLeft[7*leftStep]


        VREV64   dRevAboveVal,dAboveVal             ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7] (lane 7 first)
        VSUBL    qAbove7minus0,dRevAboveVal,dAboveLeftVal ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
        VSHR     dRevAboveValU64,dRevAboveValU64,#8 ;// pSrcAbove[X:0:1:2:3:4:5:6]
        VSUBL    qAboveDiff,dRevAboveVal,dAboveVal  ;// pSrcAbove[6] - pSrcAbove[0]
                                                    ;// pSrcAbove[5] - pSrcAbove[1]
                                                    ;// pSrcAbove[4] - pSrcAbove[2]

        VREV64   dRevLeftVal,dLeftVal               ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7] (lane 7 first)
        VSUBL    qLeft7minus0,dRevLeftVal,dAboveLeftVal ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
        VSHR     dRevLeftValU64,dRevLeftValU64,#8   ;// pSrcLeft[X:0:1:2:3:4:5:6]
        VSUBL    qLeftDiff,dRevLeftVal,dLeftVal     ;// pSrcLeft[6] - pSrcLeft[0]
                                                    ;// pSrcLeft[5] - pSrcLeft[1]
                                                    ;// pSrcLeft[4] - pSrcLeft[2]

        LDR      pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8 ;// Used to calculate Hval & Vval (r2 reused)
        VSHL     dAboveDiff0U64,dAboveDiff0U64,#16  ;// move the three differences up one 16-bit lane
        VEXT     dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2 ;// pSrcAbove[ 7-AL | 4-2 | 5-1 | 6-0 ]
        VLD1     dMultiplier,[pMultiplierTable]!    ;// weights 3,2,1,4 (lane 0 first); pointer moves to the position factors
        VSHL     dLeftDiff0U64,dLeftDiff0U64,#16
        VEXT     dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2 ;// pSrcLeft[ 7-AL | 4-2 | 5-1 | 6-0 ]


        VMUL     dHorPred,dDiffAboveS16,dMultiplier ;// pSrcAbove[ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VMUL     dVerPred,dDiffLeftS16,dMultiplier  ;// pSrcLeft [ 4*(7-AL) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VPADD    dHVValS16,dHorPred,dVerPred


        VPADDL   dHVValS32,dHVValS16                ;// [V|H] in 32 bits each
        VSHL     dHVTempS32,dHVValS32,#4            ;// 17*H = 16*H + H = (H<<4)+H
        VADD     dHVValS32,dHVValS32,dHVTempS32     ;// [ 17*V  | 17*H ]in 32 bits each
        VLD1     {dMultiplier0,dMultiplier1},[pMultiplierTable] ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
        VRSHR    dHVValS32,dHVValS32,#5             ;// [c|b], (17*x + 16) >> 5, one per 32-bit lane
        VADDL    qA,dAboveVal,dLeftVal
        VDUP     qA,qA[7]                           ;// pSrcAbove[7] + pSrcLeft[7*leftStep] in every lane
        VSHL     qA,qA,#4                           ;// [a|a|a|a|a|a|a|a]
        VDUP     qB,dHVValS16[0]                    ;// [b|b|b|b|b|b|b|b]
        VDUP     qC,dHVValS16[2]                    ;// [c|c|c|c|c|c|c|c]


        VMUL     qB,qB,qMultiplier                  ;// b*(x-3) for x = 0..7
        VMUL     qC,qC,qMultiplier                  ;// c*(y-3) for y = 0..7
        VADD     qB,qB,qA                           ;// a + b*(x-3)

        ;// Broadcast the per-row term c*(y-3). qC3 overwrites qMultiplier and
        ;// qC0/qC1 overwrite qA/dHVVal, all of which are dead by now.
        VDUP     qC0,qC[0]
        VDUP     qC1,qC[1]
        VDUP     qC2,qC[2]
        VDUP     qC3,qC[3]
        VDUP     qC4,qC[4]
        VDUP     qC5,qC[5]
        VDUP     qC6,qC[6]
        VDUP     qC7,qC[7]

        VADD     qSum0,qB,qC0
        VADD     qSum1,qB,qC1
        VADD     qSum2,qB,qC2
        VADD     qSum3,qB,qC3
        VADD     qSum4,qB,qC4
        VADD     qSum5,qB,qC5
        VADD     qSum6,qB,qC6
        VADD     qSum7,qB,qC7

        VQRSHRUN dSum0,qSum0,#5                     ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1,qSum1,#5
        VQRSHRUN dSum2,qSum2,#5
        VQRSHRUN dSum3,qSum3,#5
        VQRSHRUN dSum4,qSum4,#5
        VQRSHRUN dSum5,qSum5,#5
        VQRSHRUN dSum6,qSum6,#5
        VQRSHRUN dSum7,qSum7,#5

        ;// Shared store for HOR and PLANE: row y is written from D<y>.
DCChroma8x8PlaneStore
        ADD      pTmp, pDst, dstStep
        ADD      step, dstStep, dstStep

        VST1     dSum0,[pDst],step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1     dSum1,[pTmp],step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1     dSum2,[pDst],step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1     dSum3,[pTmp],step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1     dSum4,[pDst],step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1     dSum5,[pTmp],step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1     dSum6,[pDst],step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1     dSum7,[pTmp]                       ;// pDst[7*dstStep+x] :0<= x <= 7

        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------