;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8


;//-------------------------------------------------------
;// Jump table used to implement a C-style switch in asm:
;// predMode indexes this table of case-label addresses and
;// the selected entry is loaded straight into pc.
;// Entry order: 0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable16x16
        DCD     OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
        DCD     OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE


        IF CortexA8

;//-------------------------------------------------------
;// Multiplier table for the PLANE case, read as three
;// consecutive vectors of eight 16-bit lanes (lane 0 first):
;//   row 0 : weights for the H / V gradient sums
;//   row 1 : x = 0..7   (left half of a row)
;//   row 2 : x = 8..15  (right half of a row)
;//-------------------------------------------------------
        M_TABLE armVCM4P10_MultiplierTable16x16,1
        DCW     7,  6,  5,  4,  3,  2,  1,  8
        DCW     0,  1,  2,  3,  4,  5,  6,  7
        DCW     8,  9, 10, 11, 12, 13, 14, 15

;//--------------------------------------------
;// Constants
;// Only BLK_SIZE is referenced by the code below.
;//--------------------------------------------
BLK_SIZE        EQU 0x10
MUL_CONST0      EQU 0x01010101
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;// NOTE: pTable and pMultTable both name r9, and count (below)
;// shares r11 with pTmp2; each pair is used in different cases.
;//--------------------------------------------
y               RN 12
pc              RN 15

return          RN 0
pTable          RN 9
count           RN 11
pMultTable      RN 9
; ----------------------------------------------
; Neon registers
; Many aliases below name the same physical register with a
; different element type; instruction order in the code relies
; on that overlap.
; ----------------------------------------------
qAbove          QN Q0.U8
qLeft           QN Q1.U8
qSum8           QN Q0.U16
dSum80          DN D0.U16
dSum81          DN D1.U16
dSum4           DN D0.U16
dSum2           DN D0.U32
dSum1           DN D0.U64
qOut            QN Q3.U8
dSumLeft        DN D6.U64
dSumAbove       DN D7.U64
dSum            DN D8.U64
dSum0           DN D8.U8[0]

qH              QN Q11.S32
qV              QN Q12.S32
qA              QN Q11.S16
qB              QN Q6.S16
qC              QN Q7.S16

qB0             QN Q5.S16
qB1             QN Q6.S16
dA1             DN D23.S16

dH0             DN D22.S32
dH1             DN D23.S32
dV0             DN D24.S32
dV1             DN D25.S32

qHV             QN Q11.S64
qHV0            QN Q11.S32
qHV1            QN Q12.S64

dHV00           DN D22.S32
dHV01           DN D23.S32

dHV0            DN D22.S16[0]
dHV1            DN D23.S16[0]
dHV10           DN D24.S64
dHV11           DN D25.S64

qSum0           QN Q0.S16
qSum1           QN Q1.S16

dOut0           DN D6.U8
dOut1           DN D7.U8

dLeft0          DN D2.U8
dLeft1          DN D3.U8
qConst          QN Q13.S16

dAbove0         DN D0.U8
dAbove1         DN D1.U8

dRevLeft64      DN D12.U64
dRevLeft        DN D12.U8
dRevAbove64     DN D5.U64
dRevAbove       DN D5.U8
qLeftDiff       QN Q8.S16
dLeftDiff1      DN D17.S16
dLeftDiff64     DN D17.S64
qDiffLeft       QN Q8.S16
qDiffAbove      QN Q4.S16
dAboveDiff1     DN D9.S16
dAboveDiff64    DN D9.S64
qAboveDiff      QN Q4.S16

dAboveLeft      DN D4.U8

dDiffLeft0      DN D16.S16
dDiffLeft1      DN D17.S16
dDiffAbove0     DN D8.S16
dDiffAbove1     DN D9.S16

qLeft15minus0   QN Q7.S16
dLeft15minus0   DN D14.S16
qAbove15minus0  QN Q3.S16
dAbove15minus0  DN D6.S16

;// dMultiplier0/1 are the low/high halves of qMultiplier (Q10);
;// qMultiplier1 is a different register (Q12).
qMultiplier     QN Q10.S16
qMultiplier0    QN Q10.S16
qMultiplier1    QN Q12.S16
dMultiplier0    DN D20.S16
dMultiplier1    DN D21.S16

dBPlusCMult7    DN D1.S64
dBPlusCMult7S16 DN D1.S16

qTmp            QN Q0.U8

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable (loaded from the stack)
dstStep         RN 5    ;// input variable (loaded from the stack)
predMode        RN 6    ;// input variable (loaded from the stack)
availability    RN 7    ;// input variable (loaded from the stack)

pTmp            RN 8
step            RN 10
pTmp2           RN 11

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Writes sixteen 16-byte rows of predicted samples to pDst, one row every
;// dstStep bytes, using the case selected by predMode.
;//
;// In:   r0      pSrcLeft       left neighbours, one byte every leftStep bytes
;//       r1      pSrcAbove      16 bytes above the block
;//       r2      pSrcAboveLeft  above-left byte (read by the PLANE case only)
;//       r3      pDst           destination
;//       stack   leftStep, dstStep, predMode, availability
;// Out:  r0 = OMX_Sts_NoErr on every path
;//
;// predMode is used directly as a jump-table index and is not range checked
;// here; availability is only consulted by the DC case.
;// NOTE(review): the M_* macros are not defined in this file (presumably they
;// come from armCOMM_s.h); "r11, d15" on M_START presumably names the highest
;// ARM / NEON registers to be preserved -- confirm against the macro source.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15

        ;// Define stack arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR     pTable, =armVCM4P10_pIndexTable16x16    ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR   predMode,     PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR   leftStep,     LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR   dstStep,      DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR   availability, Availability              ;// Arg availability loaded from stack to reg

        MOV     y, #BLK_SIZE                            ;// Row counter = 16 (used by HOR and PLANE)
        LDR     pc, [pTable, predMode, LSL #2]          ;// Branch to the case selected by predMode

;//--------------------------------------------
;// VERT: every output row is a copy of the 16 bytes at pSrcAbove.
;// Even rows are stored through pDst and odd rows through pTmp,
;// both advancing by 2*dstStep.
;//--------------------------------------------
OMX_VC_16X16_VERT
        VLD1    qAbove, [pSrcAbove]
        ADD     pTmp, pDst, dstStep                     ;// pTmp -> row 1
        ADD     step, dstStep, dstStep                  ;// step = 2*dstStep
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst]                          ;// rows 14 and 15: no write-back
        VST1    qAbove, [pTmp]
        MOV     return, #OMX_Sts_NoErr                  ;// returnNoError
        M_EXIT

;//--------------------------------------------
;// HOR: each output row is its left-neighbour byte replicated 16 times.
;// pSrcLeft/pDst handle even rows, pTmp/pTmp2 handle odd rows; leftStep
;// and dstStep are doubled in place so each pointer skips the other's rows.
;//--------------------------------------------
OMX_VC_16X16_HOR
        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp  -> left sample of row 1
        ADD     leftStep, leftStep, leftStep            ;// leftStep = 2*leftStep
        ADD     pTmp2, pDst, dstStep                    ;// pTmp2 -> output row 1
        ADD     dstStep, dstStep, dstStep               ;// dstStep = 2*dstStep
LoopHor
        VLD1    {qLeft[]}, [pSrcLeft], leftStep         ;// load one byte, replicate to all lanes
        VLD1    {qTmp[]}, [pTmp], leftStep
        SUBS    y, y, #8                                ;// 8 rows are written per iteration
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        BNE     LoopHor                                 ;// Two iterations (y: 16 -> 8 -> 0) = 16 rows
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: fill the block with one value.
;//   left and upper available : (sumLeft + sumAbove + 16) >> 5
;//   only one available       : (sum + 8) >> 4
;//   neither available        : 128
;// count = number of available neighbour sets (0, 1 or 2).
;//--------------------------------------------
OMX_VC_16X16_DC
        MOV     count, #0                               ;// count = 0
        TST     availability, #OMX_VC_LEFT
        BEQ     UpperOrNoneAvailable                    ;// Skip the left sum if left is not available

        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp -> left sample of row 1
        ADD     step, leftStep, leftStep                ;// step = 2*leftStep

        ;// Gather the 16 left samples into the lanes of qLeft
        VLD1    {qLeft[0]}, [pSrcLeft],step
        VLD1    {qLeft[1]}, [pTmp],step
        VLD1    {qLeft[2]}, [pSrcLeft],step
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step
        VLD1    {qLeft[5]}, [pTmp],step
        VLD1    {qLeft[6]}, [pSrcLeft],step
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step
        VLD1    {qLeft[9]}, [pTmp],step
        VLD1    {qLeft[10]},[pSrcLeft],step
        VLD1    {qLeft[11]},[pTmp],step
        VLD1    {qLeft[12]},[pSrcLeft],step
        VLD1    {qLeft[13]},[pTmp],step
        VLD1    {qLeft[14]},[pSrcLeft],step
        VLD1    {qLeft[15]},[pTmp]

        ;// Horizontal reduction: 16 x u8 -> 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
        VPADDL  qSum8, qLeft
        ADD     count, count, #1                        ;// left available: count += 1
        VPADD   dSum4, dSum80, dSum81
        VPADDL  dSum2, dSum4
        VPADDL  dSumLeft, dSum2                         ;// dSumLeft = sum of left samples
        VRSHR   dSum, dSumLeft, #4                      ;// left-only result: (sumLeft + 8) >> 4

UpperOrNoneAvailable
        TST     availability, #OMX_VC_UPPER             ;// if(availability & #OMX_VC_UPPER)
        BEQ     BothOrNoneAvailable                     ;// Skip the upper sum if upper is not available
        VLD1    qAbove, [pSrcAbove]
        ADD     count, count, #1                        ;// if upper inc count by 1
        VPADDL  qSum8, qAbove
        VPADD   dSum4, dSum80, dSum81
        VPADDL  dSum2, dSum4
        VPADDL  dSumAbove, dSum2                        ;// dSumAbove = sum of upper samples
        VRSHR   dSum, dSumAbove, #4                     ;// upper-only result: (sumAbove + 8) >> 4

BothOrNoneAvailable
        CMP     count, #2                               ;// check if both available
        BNE     NoneAvailable
        VADD    dSum, dSumAbove, dSumLeft
        VRSHR   dSum, dSum, #5                          ;// (sumLeft + sumAbove + 16) >> 5


NoneAvailable
        VDUP    qOut, dSum0                             ;// Broadcast low byte of dSum (overwritten below if count == 0)
        CMP     count, #0                               ;// check if none available
        ADD     pTmp, pDst, dstStep                     ;// pTmp -> output row 1
        ADD     step, dstStep, dstStep                  ;// step = 2*dstStep
        BNE     LoopDC
        VMOV    qOut, #128                              ;// No neighbours: fill with 128
LoopDC
        ;// Straight-line fill (no loop): even rows via pDst, odd rows via pTmp
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// PLANE: out[y][x] = clip_u8((a + b*(x-7) + c*(y-7) + 16) >> 5)
;//   H = sum over k=1..8 of k*(above[7+k] - above[7-k]), above[-1] = aboveLeft
;//   V = same expression on the left column
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;//   a = 16*(above[15] + left[15])
;// Statement order matters here: Q0/D1, Q11 and Q12 are each reused under
;// several aliases.
;//--------------------------------------------
OMX_VC_16X16_PLANE
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                     ;// pSrcAbove[x]      :0<= x <= 15
        VLD1    dAboveLeft[0],[pSrcAboveLeft]           ;// lane 0 = pSrcAboveLeft[0]
        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp -> left sample of row 1
        ADD     step, leftStep, leftStep                ;// step = 2*leftStep
        ;// Gather the 16 left samples into the lanes of qLeft
        VLD1    {qLeft[0]},  [pSrcLeft],step
        VLD1    {qLeft[1]},  [pTmp],step
        VLD1    {qLeft[2]},  [pSrcLeft],step
        VLD1    {qLeft[3]},  [pTmp],step
        VLD1    {qLeft[4]},  [pSrcLeft],step
        VLD1    {qLeft[5]},  [pTmp],step
        VLD1    {qLeft[6]},  [pSrcLeft],step
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step
        VLD1    {qLeft[9]},  [pTmp],step
        VLD1    {qLeft[10]}, [pSrcLeft],step
        VLD1    {qLeft[11]}, [pTmp],step
        VLD1    {qLeft[12]}, [pSrcLeft],step
        VLD1    {qLeft[13]}, [pTmp],step
        VLD1    {qLeft[14]}, [pSrcLeft],step
        VLD1    {qLeft[15]}, [pTmp]

        VREV64  dRevAbove, dAbove1                      ;// pSrcAbove[15:14:13:12:11:10:9:8]
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft   ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
        VSHR    dRevAbove64, dRevAbove64, #8            ;// pSrcAbove[14:13:12:11:10:9:8:0]
        VSUBL   qAboveDiff, dRevAbove, dAbove0          ;// lane i = pSrcAbove[14-i] - pSrcAbove[i], i = 0..6

        ;// Drop lane 7 of the differences and append (above[15] - aboveLeft) in its place
        VSHL    dAboveDiff64, dAboveDiff64, #16
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1

        VREV64  dRevLeft,dLeft1                         ;// pSrcLeft[15:14:13:12:11:10:9:8]
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft      ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
        VSHR    dRevLeft64, dRevLeft64, #8              ;// pSrcLeft[14:13:12:11:10:9:8:0]
        VSUBL   qLeftDiff,dRevLeft, dLeft0              ;// lane i = pSrcLeft[14-i] - pSrcLeft[i], i = 0..6

        ;// Multiplier lanes 0..7 = 7,6,5,4,3,2,1,8 (table row 0)
        VLD1    qMultiplier, [pMultTable]!

        ;// Same lane fix-up as above, for the left column
        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1

        ;// Weighted differences, accumulated as 4 x 32-bit partial sums each
        VMULL   qH,dDiffAbove0, dMultiplier0
        VMULL   qV,dDiffLeft0,  dMultiplier0
        VMLAL   qH,dDiffAbove1, dMultiplier1
        VMLAL   qV,dDiffLeft1,  dMultiplier1

        ;// Reduce to 64-bit totals: D22 = H, D23 = V
        VPADD   dHV00,dH1,dH0
        VPADD   dHV01,dV1,dV0
        VPADDL  qHV, qHV0
        VSHL    qHV1,qHV,#2                             ;// 4*H | 4*V
        VADD    qHV,qHV,qHV1                            ;// 5*H | 5*V

        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
        VRSHR   qHV,qHV,#6

        ;// HV1 = [c*7|b*7]  (computed as 8*x - x)
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV

        ;// Multiplier0 lanes 0..7 = 0,1,2,...,7 (table row 1)
        VLD1    qMultiplier0, [pMultTable]!
        VDUP    qB, dHV0                                ;// b in every 16-bit lane
        VDUP    qC, dHV1                                ;// c in every 16-bit lane

        VADDL   qA,dAbove1,dLeft1                       ;// lane 7 = pSrcAbove[15] + pSrcLeft[15]
        VSHL    qA,qA, #4                               ;// * 16
        VDUP    qA,dA1[3]                               ;// a = 16*(above[15] + left[15]) in every lane
        VADD    dBPlusCMult7, dHV10, dHV11              ;// 7*b + 7*c

        ;// Multiplier1 lanes 0..7 = 8,9,10,...,15 (table row 2)
        VLD1    qMultiplier1, [pMultTable]
        ;// Const = a - 7*(b+c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst

        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
        VMUL    qB0,qB,qMultiplier0

        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]
        VMUL    qB1,qB,qMultiplier1

        ;// Row 0 sums: b*x + a - 7*(b+c)
        VADD    qSum0, qB0, qConst
        VADD    qSum1, qB1, qConst

        ;// Loops for 16 times
LoopPlane
        ;// (b*x + c*y + C)>>5, rounded and saturated to u8
        VQRSHRUN dOut0, qSum0,#5
        VQRSHRUN dOut1, qSum1,#5
        SUBS    y, y, #1
        VST1    qOut,[pDst],dstStep
        VADD    qSum0,qSum0,qC                          ;// next row: add c to every lane
        VADD    qSum1,qSum1,qC
        BNE     LoopPlane

        MOV     return, #OMX_Sts_NoErr

        M_END

        ENDIF ;// CortexA8

        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------