;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;//-------------------------------------------------------
;// Jump table implementing a C-style switch in assembly
;// by the method of two levels of indexing: the function
;// loads pc from entry [predMode], so the four entries
;// are the code labels of the four prediction modes, in
;// predMode order.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable16x16
        DCD     OMX_VC_16X16_VERT,  OMX_VC_16X16_HOR
        DCD     OMX_VC_16X16_DC,    OMX_VC_16X16_PLANE

        IF ARM1136JS

;//--------------------------------------------
;// Constants
;//--------------------------------------------
BLK_SIZE        EQU 0x10            ;// block is 16 rows of 16 pixels
MUL_CONST0      EQU 0x01010101      ;// byte * this = byte replicated in all 4 byte lanes
MUL_CONST1      EQU 0x00060004      ;// (MUL_CONST1..3 are not referenced by the code in this file)
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF      ;// keeps the low byte of each halfword

;//--------------------------------------------
;// Declare input registers
;// (the first four are used directly from r0-r3;
;//  the last four are loaded from the stack by
;//  the function itself)
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

;//--------------------------------------------
;// Generic temporaries, one alias per physical
;// register (r13 is deliberately absent)
;//--------------------------------------------
tVal0           RN 0
tVal1           RN 1
tVal2           RN 2
tVal3           RN 3
tVal4           RN 4
tVal5           RN 5
tVal6           RN 6
tVal7           RN 7
tVal8           RN 8
tVal9           RN 9
tVal10          RN 10
tVal11          RN 11
tVal12          RN 12
tVal14          RN 14

;//--------------------------------------------
;// Named scratch variables.  Many of these share
;// a physical register with an input or with each
;// other; each mode only uses an alias once the
;// previous owner of that register is dead.
;//--------------------------------------------
return          RN 0
innerCount      RN 0
outerCount      RN 1
pSrcLeft2       RN 1
pDst2           RN 2
sum             RN 6
pTable          RN 9
temp1           RN 10
r0x01010101     RN 10
cMul1           RN 11
dstStepx2       RN 11
r0x00FF00FF     RN 11
y               RN 12
temp2           RN 12
cMul2           RN 12
count           RN 12
leftStepx2      RN 14
pc              RN 15

;//--------------------------------------------
;// Plane-mode gradients
;//--------------------------------------------
b               RN 12
c               RN 14

;//--------------------------------------------
;// Plane-mode packed pixel pairs: pNpM holds
;// column N in the upper halfword and column M
;// in the lower halfword
;//--------------------------------------------
p2p0            RN 0
p3p1            RN 1
p6p4            RN 2
p7p5            RN 4
p10p8           RN 6
p11p9           RN 7
p14p12          RN 8
p15p13          RN 9

p3210           RN 10
p7654           RN 10
p111098         RN 10
p15141312       RN 10

        ENDIF   ;// ARM1136JS (declarations; the function body carries its own IF/ENDIF guard)

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//-----------------------------------------------------------------------------------------------
;//
;// omxVCM4P10_PredictIntra_16x16
;//
;// Fills a 16x16 block at pDst with an intra prediction built from the
;// neighbouring pixels, dispatching on predMode through
;// armVCM4P10_pIndexTable16x16 (VERT, HOR, DC, PLANE in table order).
;//
;// In:    r0      pSrcLeft       left neighbour column, rows leftStep bytes apart
;//        r1      pSrcAbove      16 pixels above the block (read with LDM in VERT/DC)
;//        r2      pSrcAboveLeft  pixel above-left of the block (PLANE only)
;//        r3      pDst           destination, rows dstStep bytes apart (written with STM/STR)
;//        stack   LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out:   r0      OMX_Sts_NoErr on every path
;// Uses:  r0-r12 and r14 as scratch; M_START is asked to handle registers up to r11
;//        (presumably it preserves the callee-saved registers and lr - see armCOMM_s.h).
;//
;// NOTE(review): predMode is used as a jump-table index without a range check,
;//               and availability is only consulted in DC mode.
;//
        IF ARM1136JS

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable, =armVCM4P10_pIndexTable16x16   ;// Index table for the switch on predMode

        ;// Load arguments from the stack
        M_LDR    predMode,     PredMode
        M_LDR    leftStep,     LeftStep
        M_LDR    dstStep,      DstStep
        M_LDR    availability, Availability

        MOV      y, #BLK_SIZE                           ;// Row counter (VERT consumes it as set here)
        LDR      pc, [pTable, predMode, LSL #2]         ;// Jump to the handler for predMode

;//--------------------------------------------
;// VERT: every row is a copy of pSrcAbove[0..15]
;//--------------------------------------------
OMX_VC_16X16_VERT
        LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9}   ;// tVal6..9 = pSrcAbove[0 to 15]
        ADD      dstStepx2, dstStep, dstStep            ;// 2 * dstStep
        ADD      pDst2, pDst, dstStep                   ;// pDst2 = row 1, pDst = row 0

        ;// M_STALL ARM1136JS=2                         ;// Stall outside the loop

LOOP_VERT
        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}       ;// even row
        SUBS     y, y, #2                               ;// two rows per iteration
        ADD      pDst,  pDst,  dstStepx2
        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}       ;// odd row
        ADD      pDst2, pDst2, dstStepx2
        BNE      LOOP_VERT                              ;// Loop 8 times
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// HOR: row r is filled with pSrcLeft[r*leftStep]
;//
;// Four rows are produced per iteration.  All four
;// left pixels are loaded at the top of the iteration
;// so that exactly rows 0..15 of the left column are
;// read (no look-ahead load past row 15).
;//--------------------------------------------
OMX_VC_16X16_HOR
        LDR      r0x01010101, =MUL_CONST0               ;// Multiplier that replicates a byte 4 times
        MOV      y, #4                                  ;// 4 iterations x 4 rows
        ADD      pDst2, pDst, dstStep                   ;// pDst2 = row 1, pDst = row 0
        ADD      dstStepx2, dstStep, dstStep            ;// 2 * dstStep
        SUB      dstStepx2, dstStepx2, #12              ;// minus the 12 bytes already stepped by the
                                                        ;// first three word stores of a row

LOOP_HOR
        M_LDRB   tVal6, [pSrcLeft], +leftStep           ;// left pixel of row 4k
        M_LDRB   tVal7, [pSrcLeft], +leftStep           ;// left pixel of row 4k+1
        M_LDRB   tVal8, [pSrcLeft], +leftStep           ;// left pixel of row 4k+2
        MUL      tVal6, tVal6, r0x01010101              ;// replicate into all four bytes
        M_LDRB   tVal9, [pSrcLeft], +leftStep           ;// left pixel of row 4k+3
        MUL      tVal7, tVal7, r0x01010101
        SUBS     y, y, #1                               ;// flags stay live until BNE (no flag-setting
                                                        ;// instruction follows in the loop)
        STR      tVal6, [pDst],  #+4                    ;// row 4k   [0 to 3]
        STR      tVal7, [pDst2], #+4                    ;// row 4k+1 [0 to 3]
        STR      tVal6, [pDst],  #+4                    ;// row 4k   [4 to 7]
        STR      tVal7, [pDst2], #+4                    ;// row 4k+1 [4 to 7]
        MUL      tVal8, tVal8, r0x01010101
        STR      tVal6, [pDst],  #+4                    ;// row 4k   [8 to 11]
        STR      tVal7, [pDst2], #+4                    ;// row 4k+1 [8 to 11]
        MUL      tVal9, tVal9, r0x01010101
        M_STR    tVal6, [pDst],  dstStepx2              ;// row 4k   [12 to 15], advance two rows
        M_STR    tVal7, [pDst2], dstStepx2              ;// row 4k+1 [12 to 15], advance two rows
        STR      tVal8, [pDst],  #+4                    ;// row 4k+2 [0 to 3]
        STR      tVal9, [pDst2], #+4                    ;// row 4k+3 [0 to 3]
        STR      tVal8, [pDst],  #+4                    ;// row 4k+2 [4 to 7]
        STR      tVal9, [pDst2], #+4                    ;// row 4k+3 [4 to 7]
        STR      tVal8, [pDst],  #+4                    ;// row 4k+2 [8 to 11]
        STR      tVal9, [pDst2], #+4                    ;// row 4k+3 [8 to 11]
        M_STR    tVal8, [pDst],  dstStepx2              ;// row 4k+2 [12 to 15], advance two rows
        M_STR    tVal9, [pDst2], dstStepx2              ;// row 4k+3 [12 to 15], advance two rows
        BNE      LOOP_HOR                               ;// Loop 4 times
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: every pixel is the rounded mean of the
;// available neighbours (128 if none available)
;//--------------------------------------------
OMX_VC_16X16_DC

        ;// M_STALL ARM1136JS=2

        MOV      count, #0                              ;// number of available neighbour edges
        TST      availability, #OMX_VC_UPPER
        BEQ      TST_LEFT                               ;// skip the upper sum if not available
        LDM      pSrcAbove,{tVal8,tVal9,tVal10,tVal11}  ;// tVal8..11 = pSrcAbove[0 to 15]
        ADD      count, count, #1

        ;// M_STALL ARM1136JS=2

        ;// Sum the 16 bytes as two parallel halfword accumulators
        UXTB16   tVal2, tVal8                           ;// pSrcAbove[0, 2]
        UXTB16   tVal6, tVal9                           ;// pSrcAbove[4, 6]
        UADD16   tVal2, tVal2, tVal6                    ;// [0, 2] + [4, 6]
        UXTB16   tVal8, tVal8, ROR #8                   ;// pSrcAbove[1, 3]
        UXTB16   tVal9, tVal9, ROR #8                   ;// pSrcAbove[5, 7]
        UADD16   tVal8, tVal8, tVal9                    ;// [1, 3] + [5, 7]
        UADD16   tVal2, tVal2, tVal8                    ;// two partial sums over pSrcAbove[0 to 7]

        UXTB16   tVal8, tVal10                          ;// pSrcAbove[8, 10]
        UXTB16   tVal9, tVal11                          ;// pSrcAbove[12, 14]
        UADD16   tVal8, tVal8, tVal9                    ;// [8, 10] + [12, 14]
        UXTB16   tVal10, tVal10, ROR #8                 ;// pSrcAbove[9, 11]
        UXTB16   tVal11, tVal11, ROR #8                 ;// pSrcAbove[13, 15]
        UADD16   tVal10, tVal10, tVal11                 ;// [9, 11] + [13, 15]
        UADD16   tVal8, tVal8, tVal10                   ;// two partial sums over pSrcAbove[8 to 15]

        UADD16   tVal2, tVal2, tVal8                    ;// two partial sums over pSrcAbove[0 to 15]

        ;// M_STALL ARM1136JS=1

        ADD      tVal2, tVal2, tVal2, LSR #16           ;// low halfword = full upper sum
                                                        ;// (high halfword still holds a partial sum)

        ;// M_STALL ARM1136JS=1

        UXTH     sum, tVal2                             ;// sum = upper sum

TST_LEFT
        TST      availability, #OMX_VC_LEFT
        BEQ      TST_COUNT
        ADD      leftStepx2, leftStep, leftStep         ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep          ;// pSrcLeft walks even rows, pSrcLeft2 odd rows

        ;// Rows 0..3
        M_LDRB   tVal8,  [pSrcLeft],  +leftStepx2
        M_LDRB   tVal9,  [pSrcLeft2], +leftStepx2
        M_LDRB   tVal10, [pSrcLeft],  +leftStepx2
        M_LDRB   tVal11, [pSrcLeft2], +leftStepx2
        ADD      tVal7, tVal8, tVal9
        ADD      count, count, #1                       ;// left edge is available
        ADD      tVal6, tVal10, tVal11

        ;// Rows 4..7
        M_LDRB   tVal8,  [pSrcLeft],  +leftStepx2
        M_LDRB   tVal9,  [pSrcLeft2], +leftStepx2
        M_LDRB   tVal10, [pSrcLeft],  +leftStepx2
        M_LDRB   tVal11, [pSrcLeft2], +leftStepx2
        ADD      sum, tVal7, tVal6                      ;// sum = rows 0..3 (replaces the upper sum,
                                                        ;// which is still kept in tVal2)
        ADD      tVal8, tVal8, tVal9
        ADD      tVal10, tVal10, tVal11
        ADD      tVal7, tVal8, tVal10                   ;// tVal7 = rows 4..7

        ;// Rows 8..11
        M_LDRB   tVal8,  [pSrcLeft],  +leftStepx2
        M_LDRB   tVal9,  [pSrcLeft2], +leftStepx2
        M_LDRB   tVal10, [pSrcLeft],  +leftStepx2
        M_LDRB   tVal11, [pSrcLeft2], +leftStepx2
        ADD      sum, sum, tVal7                        ;// sum += rows 4..7
        ADD      tVal8, tVal8, tVal9
        ADD      tVal10, tVal10, tVal11
        ADD      tVal7, tVal8, tVal10                   ;// tVal7 = rows 8..11

        ;// Rows 12..15
        M_LDRB   tVal8,  [pSrcLeft],  +leftStepx2
        M_LDRB   tVal9,  [pSrcLeft2], +leftStepx2
        M_LDRB   tVal10, [pSrcLeft],  +leftStepx2
        M_LDRB   tVal11, [pSrcLeft2], +leftStepx2
        ADD      sum, sum, tVal7                        ;// sum += rows 8..11
        ADD      tVal8, tVal8, tVal9
        ADD      tVal10, tVal10, tVal11
        ADD      tVal7, tVal8, tVal10                   ;// tVal7 = rows 12..15
        ADD      sum, sum, tVal7                        ;// sum = full left sum

TST_COUNT
        CMP      count, #0
        MOVEQ    sum, #128                              ;// no neighbours: DC = 128
        BEQ      TST_COUNT0
        CMP      count, #1                              ;// EQ: one edge, NE: both edges
        ADDEQ    sum, sum, #8                           ;// one edge : rounding term for >> 4
        ADDNE    sum, sum, tVal2                        ;// two edges: left sum + upper sum
        ADDNE    sum, sum, #16                          ;// two edges: rounding term for >> 5

        ;// M_STALL ARM1136JS=1

        UXTH     sum, sum                               ;// drop the stale partial sum that tVal2
                                                        ;// carried in its upper halfword

        ;// M_STALL ARM1136JS=1

        LSREQ    sum, sum, #4                           ;// one edge : mean of 16 pixels

        ;// M_STALL ARM1136JS=1

        LSRNE    sum, sum, #5                           ;// two edges: mean of 32 pixels

TST_COUNT0

        ;// M_STALL ARM1136JS=1

        ORR      sum, sum, sum, LSL #8                  ;// DC value in both bytes of a halfword

        ;// M_STALL ARM1136JS=1

        ORR      tVal6, sum, sum, LSL #16               ;// DC value in all four bytes
        CPY      tVal7, tVal6
        CPY      tVal8, tVal6
        CPY      tVal9, tVal6
        ADD      dstStepx2, dstStep, dstStep            ;// 2 * dstStep
        ADD      pDst2, pDst, dstStep                   ;// pDst2 = row 1, pDst = row 0
        MOV      y, #BLK_SIZE                           ;// y was reused as count above; reload it

LOOP_DC
        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}       ;// even row
        SUBS     y, y, #2                               ;// two rows per iteration
        ADD      pDst,  pDst,  dstStepx2
        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}       ;// odd row
        ADD      pDst2, pDst2, dstStepx2
        BNE      LOOP_DC                                ;// Loop 8 times

        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// PLANE:
;//   a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])
;//   H = sum over k=1..8 of k * (above[7+k] - above[7-k]),  above[-1] = pSrcAboveLeft[0]
;//   V = same expression down the left column
;//   b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
;//   pred[y][x] = clip((a + 16 + b*(x-7) + c*(y-7)) >> 5) to 0..255
;//--------------------------------------------
OMX_VC_16X16_PLANE

        ;// M_STALL ARM1136JS=3
        RSB      tVal14, leftStep, leftStep, LSL #4     ;// tVal14 = 15*leftStep

        ;// M_STALL ARM1136JS=2
        LDRB     tVal10, [pSrcLeft, tVal14]             ;// tVal10 = pSrcLeft[15*leftStep]
        LDRB     tVal11, [pSrcAboveLeft]                ;// tVal11 = pSrcAboveLeft[0]
        LDRB     tVal12, [pSrcAbove, #15]               ;// tVal12 = pSrcAbove[15]

        ADD      tVal2,  tVal12, tVal10                 ;// pSrcAbove[15] + pSrcLeft[15*leftStep]
        SUB      tVal10, tVal10, tVal11                 ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
        SUB      tVal11, tVal12, tVal11                 ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
        MOV      tVal2,  tVal2, LSL #4                  ;// tVal2 = a

        ;// Accumulate H in tVal11
        MOV      tVal11, tVal11, LSL #3                 ;// 8*([15]-[-1])
        LDRB     tVal6, [pSrcAbove, #0]
        LDRB     tVal7, [pSrcAbove, #14]
        SUB      tVal8, tVal7, tVal6
        RSB      tVal8, tVal8, tVal8, LSL #3            ;// 7*([14]-[0])
        ADD      tVal11, tVal11, tVal8
        LDRB     tVal6, [pSrcAbove, #1]
        LDRB     tVal7, [pSrcAbove, #13]
        SUB      tVal8, tVal7, tVal6
        ADD      tVal8, tVal8, tVal8
        ADD      tVal8, tVal8, tVal8, LSL #1            ;// 6*([13]-[1])
        ADD      tVal11, tVal11, tVal8
        LDRB     tVal6, [pSrcAbove, #2]
        LDRB     tVal7, [pSrcAbove, #12]
        SUB      tVal8, tVal7, tVal6
        ADD      tVal8, tVal8, tVal8, LSL #2            ;// 5*([12]-[2])
        ADD      tVal11, tVal11, tVal8
        LDRB     tVal6, [pSrcAbove, #3]
        LDRB     tVal7, [pSrcAbove, #11]
        SUB      tVal8, tVal7, tVal6
        ADD      tVal11, tVal11, tVal8, LSL #2          ;// + 4*([11]-[3])
        LDRB     tVal6, [pSrcAbove, #4]
        LDRB     tVal7, [pSrcAbove, #10]
        SUB      tVal8, tVal7, tVal6
        ADD      tVal8, tVal8, tVal8, LSL #1            ;// 3*([10]-[4])
        ADD      tVal11, tVal11, tVal8
        LDRB     tVal6, [pSrcAbove, #5]
        LDRB     tVal7, [pSrcAbove, #9]
        SUB      tVal8, tVal7, tVal6
        ADD      tVal11, tVal11, tVal8, LSL #1          ;// + 2*([9]-[5])
        LDRB     tVal6, [pSrcAbove, #6]
        LDRB     tVal7, [pSrcAbove, #8]
        SUB      tVal8, tVal7, tVal6                    ;// 1*([8]-[6])
        ADD      tVal7, tVal11, tVal8                   ;// tVal7 = H

        ADD      tVal2, tVal2, #16                      ;// tVal2 = a + 16
        MOV      tVal1, pSrcLeft                        ;// tVal1 walks the left column down from row 0
        SUB      tVal9, tVal14, leftStep                ;// tVal9 = 14*leftStep
        ADD      tVal9, pSrcLeft, tVal9                 ;// tVal9 walks the left column up from row 14

        ;// Accumulate V in tVal6, interleaved with the computation of b
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[14*leftStep]
        M_LDRB   tVal11, [tVal1], +leftStep             ;// tVal11 = pSrcLeft[0]
        ADD      tVal7, tVal7, tVal7, LSL #2            ;// 5 * H
        ADD      tVal7, tVal7, #32                      ;// 5 * H + 32
        SUB      tVal8, tVal8, tVal11                   ;// pSrcLeft[14*leftStep] - pSrcLeft[0]
        ASR      tVal12, tVal7, #6                      ;// tVal12 = b = (5 * H + 32) >> 6

        RSB      tVal8, tVal8, tVal8, LSL #3            ;// V1 = 7 * (pSrcLeft[14*leftStep] - pSrcLeft[0])
        ADD      tVal6, tVal8, tVal10, LSL #3           ;// V = 8*V0 + V1
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[13*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[leftStep]
        RSB      tVal7, tVal12, tVal12, LSL #3          ;// 7*b
        SUB      tVal2, tVal2, tVal7                    ;// tVal2 = a + 16 - 7*b
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[13*leftStep] - pSrcLeft[leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[12*leftStep]
        ADD      tVal7, tVal7, tVal7                    ;// 2 * (...)
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[2*leftStep]
        ADD      tVal7, tVal7, tVal7, LSL #1            ;// 6 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
        ADD      tVal6, tVal6, tVal7                    ;// V += V2
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[11*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[3*leftStep]
        ADD      tVal7, tVal7, tVal7, LSL #2            ;// 5 * (pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep])
        ADD      tVal6, tVal6, tVal7                    ;// V += V3
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[11*leftStep] - pSrcLeft[3*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[10*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[4*leftStep]
        ADD      tVal6, tVal6, tVal7, LSL #2            ;// V += 4 * (...)
        SUB      dstStep, dstStep, #16                  ;// each output row is written with four
                                                        ;// post-incremented word stores (16 bytes)
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[9*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[5*leftStep]
        ADD      tVal7, tVal7, tVal7, LSL #1            ;// 3 * (pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep])
        ADD      tVal6, tVal6, tVal7                    ;// V += V5
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[9*leftStep] - pSrcLeft[5*leftStep]
        M_LDRB   tVal8,  [tVal9], -leftStep             ;// tVal8  = pSrcLeft[8*leftStep]
        M_LDRB   tVal10, [tVal1], +leftStep             ;// tVal10 = pSrcLeft[6*leftStep]
        ADD      tVal6, tVal6, tVal7, LSL #1            ;// V += 2 * (...)

        ;// M_STALL ARM1136JS=1
        SUB      tVal7, tVal8, tVal10                   ;// pSrcLeft[8*leftStep] - pSrcLeft[6*leftStep]
        ADD      tVal6, tVal6, tVal7                    ;// V complete

        ;// M_STALL ARM1136JS=1
        ADD      tVal6, tVal6, tVal6, LSL #2            ;// 5*V
        ADD      tVal6, tVal6, #32                      ;// 5*V + 32

        ;// M_STALL ARM1136JS=1
        ASR      tVal14, tVal6, #6                      ;// tVal14 = c = (5*V + 32) >> 6

        ;// M_STALL ARM1136JS=1
        RSB      tVal6, tVal14, tVal14, LSL #3          ;// 7*c
        UXTH     tVal14, tVal14                         ;// keep c as a 16-bit lane value
        ADD      tVal10, tVal12, tVal12                 ;// 2*b
        ORR      tVal14, tVal14, tVal14, LSL #16        ;// c = {c, c}
        SUB      tVal6, tVal2, tVal6                    ;// d = a + 16 - 7*b - 7*c  (row 0, column 0)
        ADD      tVal1, tVal6, tVal10                   ;// p2 = d + 2*b
        ADD      tVal10, tVal10, tVal12                 ;// 3*b

        ;// Pack {p2, p0}.  d is a signed 32-bit value and may be negative, so only its
        ;// low halfword may enter the pack: a plain ORR would let the sign-extension bits
        ;// of d overwrite p2.  PKHBT takes bits [15:0] from d and bits [31:16] from p2<<16.
        PKHBT    tVal0, tVal6, tVal1, LSL #16           ;// p2p0 = {p2, p0}
        UXTH     tVal12, tVal12                         ;// keep b as a 16-bit lane value
        UXTH     tVal10, tVal10                         ;// keep 3*b as a 16-bit lane value
        ORR      tVal12, tVal12, tVal12, LSL #16        ;// {b, b}
        ORR      tVal10, tVal10, tVal10, LSL #16        ;// {3b, 3b}
        SADD16   tVal1, tVal0, tVal12                   ;// p3p1   = p2p0   + {b, b}
        SADD16   tVal2, tVal1, tVal10                   ;// p6p4   = p3p1   + {3b, 3b}
        SADD16   tVal4, tVal2, tVal12                   ;// p7p5   = p6p4   + {b, b}
        SADD16   tVal6, tVal4, tVal10                   ;// p10p8  = p7p5   + {3b, 3b}
        SADD16   tVal7, tVal6, tVal12                   ;// p11p9  = p10p8  + {b, b}
        SADD16   tVal8, tVal7, tVal10                   ;// p14p12 = p11p9  + {3b, 3b}
        SADD16   tVal9, tVal8, tVal12                   ;// p15p13 = p14p12 + {b, b}
        LDR      r0x00FF00FF, =MASK_CONST               ;// byte-lane mask; its top 4 bits double
                                                        ;// as the row counter (see ADDS below)

        ;// Per row: saturate each lane to 13 bits unsigned, shift right by 5 to get the
        ;// 0..255 pixel, interleave even/odd columns into one word, store, then step
        ;// every lane by c for the next row.
LOOP_PLANE

        USAT16   temp2, #13, p3p1
        USAT16   temp1, #13, p2p0
        SADD16   p3p1, p3p1, c
        SADD16   p2p0, p2p0, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4                      ;// columns 0..3

        USAT16   temp2, #13, p7p5
        USAT16   temp1, #13, p6p4
        SADD16   p7p5, p7p5, c
        SADD16   p6p4, p6p4, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4                      ;// columns 4..7

        USAT16   temp2, #13, p11p9
        USAT16   temp1, #13, p10p8
        SADD16   p11p9, p11p9, c
        SADD16   p10p8, p10p8, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4                      ;// columns 8..11

        USAT16   temp2, #13, p15p13
        USAT16   temp1, #13, p14p12
        SADD16   p15p13, p15p13, c
        SADD16   p14p12, p14p12, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4                      ;// columns 12..15

        ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28       ;// row counter in the top 4 bits of the mask;
                                                        ;// carry is set on the 16th increment

        ADD      pDst, pDst, dstStep                    ;// next row (dstStep was reduced by 16)

        BCC      LOOP_PLANE                             ;// Loop 16 times
        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// ARM1136JS


        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------