;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Syntax:  ARM RVCT (armasm); ';' starts a comment that runs to end of line,
;//          so every statement must sit on its own physical line.
;// Target:  Cortex-A8 (NEON) variant only; see M_VARIANTS / IF CortexA8 below.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8


;//-------------------------------------------------------
;// Dispatch table used to implement a C-style switch in
;// assembly: predMode indexes this table and the selected
;// handler address is loaded straight into pc.
;// Entry order: 0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable16x16
        DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
        DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE


        IF CortexA8

;//-------------------------------------------------------
;// 16-bit multipliers for plane prediction, read as three
;// consecutive 8-lane rows (lane 0 listed first):
;//   row 0: weights for the H / V gradient sums
;//   row 1: x = 0..7   (left half of an output row)
;//   row 2: x = 8..15  (right half of an output row)
;// NOTE(review): the second M_TABLE argument (1) is interpreted by
;// armCOMM_s.h; presumably an alignment setting - confirm.
;//-------------------------------------------------------
        M_TABLE armVCM4P10_MultiplierTable16x16,1
        DCW   7,  6,  5,  4,  3,  2,  1,  8
        DCW   0,  1,  2,  3,  4,  5,  6,  7
        DCW   8,  9, 10, 11, 12, 13, 14, 15

;//--------------------------------------------
;// Constants
;// Only BLK_SIZE is referenced in this file; the MUL_CONSTx and
;// MASK_CONST values are not used by the code below.
;//--------------------------------------------
BLK_SIZE        EQU 0x10
MUL_CONST0      EQU 0x01010101
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;// Several names share a core register (pTable/pMultTable on r9,
;// count/pTmp2 on r11); each pair is used in different prediction
;// modes, so their live ranges never overlap.
;//--------------------------------------------
y               RN 12
pc              RN 15

return          RN 0
pTable          RN 9
count           RN 11
pMultTable      RN 9

;//--------------------------------------------
;// Neon registers
;// Many aliases below name the same physical D/Q register with a
;// different element type, or name a register that is reused once
;// its previous value is dead. Instruction order in the PLANE path
;// depends on this; do not reorder without re-checking the aliases.
;//--------------------------------------------
qAbove          QN Q0.U8
qLeft           QN Q1.U8
qSum8           QN Q0.U16
dSum80          DN D0.U16
dSum81          DN D1.U16
dSum4           DN D0.U16
dSum2           DN D0.U32
dSum1           DN D0.U64
qOut            QN Q3.U8
dSumLeft        DN D6.U64
dSumAbove       DN D7.U64
dSum            DN D8.U64
dSum0           DN D8.U8[0]

qH              QN Q11.S32
qV              QN Q12.S32
qA              QN Q11.S16
qB              QN Q6.S16
qC              QN Q7.S16

qB0             QN Q5.S16
qB1             QN Q6.S16
dA1             DN D23.S16

dH0             DN D22.S32
dH1             DN D23.S32
dV0             DN D24.S32
dV1             DN D25.S32

qHV             QN Q11.S64
qHV0            QN Q11.S32
qHV1            QN Q12.S64

dHV00           DN D22.S32
dHV01           DN D23.S32

dHV0            DN D22.S16[0]
dHV1            DN D23.S16[0]
dHV10           DN D24.S64
dHV11           DN D25.S64

qSum0           QN Q0.S16
qSum1           QN Q1.S16

dOut0           DN D6.U8
dOut1           DN D7.U8

dLeft0          DN D2.U8
dLeft1          DN D3.U8
qConst          QN Q13.S16

dAbove0         DN D0.U8
dAbove1         DN D1.U8

dRevLeft64      DN D12.U64
dRevLeft        DN D12.U8
dRevAbove64     DN D5.U64
dRevAbove       DN D5.U8
qLeftDiff       QN Q8.S16
dLeftDiff1      DN D17.S16
dLeftDiff64     DN D17.S64
qDiffLeft       QN Q8.S16
qDiffAbove      QN Q4.S16
dAboveDiff1     DN D9.S16
dAboveDiff64    DN D9.S64
qAboveDiff      QN Q4.S16

dAboveLeft      DN D4.U8

dDiffLeft0      DN D16.S16
dDiffLeft1      DN D17.S16
dDiffAbove0     DN D8.S16
dDiffAbove1     DN D9.S16

qLeft15minus0   QN Q7.S16
dLeft15minus0   DN D14.S16
qAbove15minus0  QN Q3.S16
dAbove15minus0  DN D6.S16

qMultiplier     QN Q10.S16
qMultiplier0    QN Q10.S16
qMultiplier1    QN Q12.S16
dMultiplier0    DN D20.S16
dMultiplier1    DN D21.S16

dBPlusCMult7    DN D1.S64
dBPlusCMult7S16 DN D1.S16

qTmp            QN Q0.U8

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

pTmp            RN 8
step            RN 10
pTmp2           RN 11

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Purpose: write a 16x16 block of predicted bytes to pDst using one of four
;//          modes (vertical, horizontal, DC, plane) selected by predMode.
;//
;// In:   r0      = pSrcLeft       left neighbour column, one byte every leftStep bytes
;//       r1      = pSrcAbove      16 bytes of the row above
;//       r2      = pSrcAboveLeft  one byte, above-left neighbour (read in PLANE only)
;//       r3      = pDst           destination, rows dstStep bytes apart
;//       [stack] = LeftStep, DstStep, PredMode, Availability (4 bytes each, in that order)
;// Out:  r0      = OMX_Sts_NoErr on every path.
;//
;// Notes:
;//  - No argument validation is done here. predMode is used directly as an
;//    index into the four-entry dispatch table, so the caller must pass 0..3.
;//  - availability is consulted in DC mode only (OMX_VC_LEFT / OMX_VC_UPPER);
;//    the other modes read their neighbours unconditionally.
;//  - NOTE(review): the register save/restore set is produced by M_START/M_EXIT/
;//    M_END in armCOMM_s.h from the "r11, d15" arguments; presumably r4-r11 and
;//    d8-d15 are preserved - confirm against the macro definitions.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15

        ;// Define stack arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR     pTable,=armVCM4P10_pIndexTable16x16       ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR   predMode, PredMode                        ;// Arg predMode loaded from stack to reg
        M_LDR   leftStep, LeftStep                        ;// Arg leftStep loaded from stack to reg
        M_LDR   dstStep,  DstStep                         ;// Arg dstStep loaded from stack to reg
        M_LDR   availability, Availability                ;// Arg availability loaded from stack to reg

        MOV     y, #BLK_SIZE                              ;// Row counter = 16 (used by HOR and PLANE loops)
        LDR     pc, [pTable, predMode, LSL #2]            ;// Branch to the case selected by predMode

;//--------------------------------------------
;// Vertical: every output row is a copy of the 16 bytes above.
;// pDst walks the even rows and pTmp the odd rows, both advancing
;// by 2*dstStep, so the 16 stores below cover rows 0..15.
;//--------------------------------------------
OMX_VC_16X16_VERT
        VLD1    qAbove,  [pSrcAbove]
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst]
        VST1    qAbove, [pTmp]
        MOV     return, #OMX_Sts_NoErr                    ;// returnNoError
        M_EXIT

;//--------------------------------------------
;// Horizontal: each output row is its left neighbour byte replicated
;// across all 16 lanes. Even rows use pSrcLeft/pDst, odd rows use
;// pTmp/pTmp2; leftStep and dstStep are doubled in place so each
;// pointer skips the row handled by its partner.
;//--------------------------------------------
OMX_VC_16X16_HOR
        ADD     pTmp, pSrcLeft, leftStep
        ADD     leftStep, leftStep, leftStep
        ADD     pTmp2, pDst, dstStep
        ADD     dstStep, dstStep, dstStep
LoopHor
        VLD1     {qLeft[]}, [pSrcLeft], leftStep          ;// load one byte, replicate to all lanes
        VLD1     {qTmp[]}, [pTmp], leftStep
        SUBS     y, y, #8                                 ;// 8 rows are produced per pass
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep

        BNE      LoopHor                                  ;// Two passes of 8 rows = 16 rows
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: fill the block with one value.
;//   left and upper available : (sumLeft + sumAbove + 16) >> 5
;//   only one of them         : (sum + 8) >> 4
;//   neither                  : 128
;// count records how many of the two neighbours were summed (0, 1 or 2).
;//--------------------------------------------
OMX_VC_16X16_DC
        MOV     count, #0                                 ;// count = 0
        TST     availability, #OMX_VC_LEFT
        BEQ     UpperOrNoneAvailable                      ;// Skip the left sum if left is not available

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Gather the 16 left bytes into qLeft, alternating two pointers
        VLD1    {qLeft[0]}, [pSrcLeft],step
        VLD1    {qLeft[1]}, [pTmp],step
        VLD1    {qLeft[2]}, [pSrcLeft],step
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step
        VLD1    {qLeft[5]}, [pTmp],step
        VLD1    {qLeft[6]}, [pSrcLeft],step
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step
        VLD1    {qLeft[9]}, [pTmp],step
        VLD1    {qLeft[10]},[pSrcLeft],step
        VLD1    {qLeft[11]},[pTmp],step
        VLD1    {qLeft[12]},[pSrcLeft],step
        VLD1    {qLeft[13]},[pTmp],step
        VLD1    {qLeft[14]},[pSrcLeft],step
        VLD1    {qLeft[15]},[pTmp]

        ;// Pairwise-add reduction: 16 x u8 -> 8 x u16 -> 4 -> 2 x u32 -> 1 x u64
        VPADDL   qSum8, qLeft
        ADD     count, count, #1                          ;// left contributes: count++
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumLeft, dSum2
        VRSHR    dSum, dSumLeft, #4                       ;// (sumLeft + 8) >> 4, used if upper is absent

UpperOrNoneAvailable
        TST     availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
        BEQ     BothOrNoneAvailable                       ;// Skip the upper sum if upper is not available
        VLD1    qAbove, [pSrcAbove]
        ADD     count, count, #1                          ;// if upper inc count by 1
        VPADDL  qSum8, qAbove
        VPADD   dSum4, dSum80, dSum81
        VPADDL  dSum2, dSum4
        VPADDL  dSumAbove, dSum2
        VRSHR   dSum, dSumAbove, #4                       ;// (sumAbove + 8) >> 4, used if left is absent

BothOrNoneAvailable
        CMP     count, #2                                 ;// check if both available
        BNE     NoneAvailable
        VADD    dSum, dSumAbove, dSumLeft
        VRSHR   dSum, dSum, #5                            ;// (sumLeft + sumAbove + 16) >> 5


NoneAvailable
        VDUP    qOut, dSum0                               ;// broadcast DC byte; replaced below when count == 0
        CMP     count, #0                                 ;// check if none available
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        BNE     LoopDC
        VMOV    qOut, #128                                ;// no neighbours: use mid-grey
LoopDC
        ;// Straight-line fill: pDst = even rows, pTmp = odd rows
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// Plane: pred[y][x] = sat_u8((a + b*(x-7) + c*(y-7) + 16) >> 5) with
;//   H = sum_{k=1..8} k * (above[7+k] - above[7-k])   (above[-1] = aboveLeft)
;//   V = same expression on the left column
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;//   a = 16 * (above[15] + left[15])
;// Row 0 is built as b*x + (a - 7*(b+c)); each later row adds c.
;//--------------------------------------------
OMX_VC_16X16_PLANE
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                       ;// pSrcAbove[x], x = 0..15
        VLD1    dAboveLeft[0],[pSrcAboveLeft]             ;// only lane 0 is loaded; other lanes are unspecified
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        VLD1    {qLeft[0]},  [pSrcLeft],step
        VLD1    {qLeft[1]},  [pTmp],step
        VLD1    {qLeft[2]},  [pSrcLeft],step
        VLD1    {qLeft[3]},  [pTmp],step
        VLD1    {qLeft[4]},  [pSrcLeft],step
        VLD1    {qLeft[5]},  [pTmp],step
        VLD1    {qLeft[6]},  [pSrcLeft],step
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step
        VLD1    {qLeft[9]},  [pTmp],step
        VLD1    {qLeft[10]}, [pSrcLeft],step
        VLD1    {qLeft[11]}, [pTmp],step
        VLD1    {qLeft[12]}, [pSrcLeft],step
        VLD1    {qLeft[13]}, [pTmp],step
        VLD1    {qLeft[14]}, [pSrcLeft],step
        VLD1    {qLeft[15]}, [pTmp]

        ;// Build the eight above-row differences, lane order matching
        ;// multiplier row 0 (7,6,5,4,3,2,1,8):
        ;//   [a14-a0, a13-a1, a12-a2, a11-a3, a10-a4, a9-a5, a8-a6, a15-aboveLeft]
        VREV64  dRevAbove, dAbove1                        ;// pSrcAbove[15:14:13:12:11:10:9:8]
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft     ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
        VSHR    dRevAbove64, dRevAbove64, #8              ;// pSrcAbove[14:13:12:11:10:9:8:X]
        VSUBL   qAboveDiff, dRevAbove, dAbove0

        VSHL    dAboveDiff64, dAboveDiff64, #16           ;// drop the unwanted top lane, shift the rest up one lane
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1  ;// shift back down and append (a15 - aboveLeft)

        ;// Same construction for the left column
        VREV64  dRevLeft,dLeft1                           ;// pSrcLeft[15:14:13:12:11:10:9:8]
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft        ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
        VSHR    dRevLeft64, dRevLeft64, #8                ;// pSrcLeft[14:13:12:11:10:9:8:X]
        VSUBL   qLeftDiff,dRevLeft, dLeft0

        ;// Multiplier = [8|1|2|...|6|7]  (highest lane listed first)
        VLD1    qMultiplier, [pMultTable]!

        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1

        ;// Weighted sums: qH / qV each hold four 32-bit partial sums
        VMULL   qH,dDiffAbove0, dMultiplier0
        VMULL   qV,dDiffLeft0, dMultiplier0
        VMLAL   qH,dDiffAbove1, dMultiplier1
        VMLAL   qV,dDiffLeft1, dMultiplier1

        ;// Reduce to qHV = [V | H] as two 64-bit lanes, then scale by 5
        VPADD   dHV00,dH1,dH0
        VPADD   dHV01,dV1,dV0
        VPADDL  qHV, qHV0
        VSHL    qHV1,qHV,#2                               ;// 4*H, 4*V
        VADD    qHV,qHV,qHV1                              ;// 5*H, 5*V

        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
        VRSHR   qHV,qHV,#6

        ;// HV1 = [c*7|b*7]
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV

        ;// Multiplier0 = [7|6|5|...|1|0]  (x = 0..7, highest lane listed first)
        VLD1    qMultiplier0, [pMultTable]!
        VDUP    qB, dHV0                                  ;// b in every lane
        VDUP    qC, dHV1                                  ;// c in every lane

        ;// a = 16 * (above[15] + left[15]); qA reuses Q11 now that b and c are copied out
        VADDL   qA,dAbove1,dLeft1
        VSHL    qA,qA, #4
        VDUP    qA,dA1[3]                                 ;// lane 7 of the sum = above[15] + left[15]
        VADD    dBPlusCMult7, dHV10, dHV11                ;// 7*b + 7*c (must precede the Q12 reload below)

        ;// Multiplier1 = [15|14|...|9|8]  (x = 8..15; overwrites Q12, i.e. qHV1)
        VLD1    qMultiplier1, [pMultTable]
        ;// Const = a - 7*(b+c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst

        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
        VMUL    qB0,qB,qMultiplier0

        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]  (qB1 shares Q6 with qB)
        VMUL    qB1,qB,qMultiplier1

        VADD    qSum0, qB0, qConst                        ;// row 0, x = 0..7
        VADD    qSum1, qB1, qConst                        ;// row 0, x = 8..15

        ;// Loops for 16 times, one output row per iteration
LoopPlane
        ;// (b*x + c*y + C + 16)>>5, saturated to unsigned 8 bits
        VQRSHRUN dOut0, qSum0,#5
        VQRSHRUN dOut1, qSum1,#5
        SUBS     y, y, #1
        VST1     qOut,[pDst],dstStep
        VADD     qSum0,qSum0,qC                           ;// next row: add c
        VADD     qSum1,qSum1,qC
        BNE      LoopPlane

        MOV     return, #OMX_Sts_NoErr

        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 ends
;//-----------------------------------------------------------------------------------------------