;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT  armVCM4P10_pIndexTable8x8

;// Processor variants this file is built for
        M_VARIANTS ARM1136JS

        AREA    table, DATA

;//-------------------------------------------------------
;// Jump table used to implement the C-style switch on
;// predMode: one word per case label, in the order
;// DC, HOR, VERT, PLANE. The function loads pc from
;// entry [predMode] (word index) of this table.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD     OMX_VC_CHROMA_DC,   OMX_VC_CHROMA_HOR
        DCD     OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE

;// Halfword multiplier table. It is not referenced by the code in
;// this file; it is kept because other code may rely on the label.
        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW      3,  2,  1, 4
        DCW     -3, -2, -1, 0
        DCW      1,  2,  3, 4

        IF ARM1136JS

;//--------------------------------------------
;// Constants
;//--------------------------------------------

BLK_SIZE        EQU 0x8                 ;// rows (and columns) in the chroma block
MUL_CONST0      EQU 0x01010101          ;// byte * MUL_CONST0 copies the byte into all 4 lanes
MASK_CONST      EQU 0x00FF00FF          ;// keeps the low byte of each halfword
MUL_CONST1      EQU 0x80808080          ;// four bytes of 128, stored when no neighbour is available

;//--------------------------------------------
;// Scratch variables (register aliases)
;//
;// Several aliases deliberately share one physical register;
;// their live ranges must not overlap in the code that uses them.
;//--------------------------------------------

;// Named working registers
return          RN 0
pSrcLeft2       RN 1
pDst2           RN 2
sum1            RN 6
sum2            RN 7
pTable          RN 9
r0x01010101     RN 10
dstStepx2       RN 11
r0x00FF00FF     RN 11
y               RN 12
leftStepx2      RN 14
outerCount      RN 14
pc              RN 15

;// Generic temporaries, tValN == rN (there is no tVal13: r13 is sp)
tVal0           RN 0
tVal1           RN 1
tVal2           RN 2
tVal3           RN 3
tVal4           RN 4
tVal5           RN 5
tVal6           RN 6
tVal7           RN 7
tVal8           RN 8
tVal9           RN 9
tVal10          RN 10
tVal11          RN 11
tVal12          RN 12
tVal14          RN 14

;// Plane-prediction gradients
b               RN 14
c               RN 12

;// Plane prediction: two 16-bit column accumulators per register
p2p0            RN 0
p3p1            RN 1
p6p4            RN 2
p7p5            RN 4

;// Plane prediction: the same column pairs reduced to 8 bits per halfword
pp2pp0          RN 6
pp3pp1          RN 7
pp6pp4          RN 8
pp7pp5          RN 9

;// Plane prediction: four packed output pixels
p3210           RN 10
p7654           RN 10

;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//-----------------------------------------------------------------------------------------------
;//
;// Purpose:
;//   Writes an 8x8 block of predicted chroma samples to pDst, built from
;//   the neighbouring samples to the left, above and above-left.
;//
;// In (registers):
;//   r0 = pSrcLeft       left neighbour column, consecutive samples are leftStep bytes apart
;//   r1 = pSrcAbove      8 neighbour samples above the block (read as two words)
;//   r2 = pSrcAboveLeft  above-left corner sample (read only by the PLANE case)
;//   r3 = pDst           destination, consecutive rows are dstStep bytes apart
;// In (stack, 4 bytes each, in this order):
;//   LeftStep, DstStep, PredMode, Availability
;//
;// Out:
;//   r0 = OMX_Sts_NoErr on every path.
;//
;// Behaviour by predMode (used directly as an index into armVCM4P10_pIndexTable8x8):
;//   DC    - availability is masked with (OMX_VC_UPPER + OMX_VC_LEFT):
;//             both   : 4x4 quadrant means, from top+left / top only / left only / top+left
;//             upper  : per-half column means taken from pSrcAbove
;//             left   : per-half row means taken from pSrcLeft
;//             neither: every byte is 0x80
;//   HOR   - each row is filled with its left neighbour
;//   VERT  - each row is a copy of pSrcAbove[0..7]
;//   PLANE - pred(x,y) = clip8((a + b*(x-3) + c*(y-3) + 16) >> 5)
;//
;// Notes:
;//   * No argument checking is performed; predMode is not range-checked before it
;//     indexes the jump table.
;//   * NOTE(review): the STRD/LDM accesses presumably require pDst (and dstStep) to be
;//     8-byte aligned and pSrcAbove to be word aligned - confirm against the callers.
;//   * Byte-lane comments assume little-endian data - TODO confirm.
;//
;// Fixes relative to the previous revision (PLANE case only):
;//   1. {p2,p0} is packed with PKHBT instead of ORR. With ORR, a negative p0 leaked its
;//      sign-extension bits into the p2 halfword and corrupted columns 2, 3, 6 and 7.
;//   2. The per-row clip now writes to scratch registers. Previously USAT16 clipped the
;//      running accumulators in place, so "+c" was applied to already-clipped values on
;//      the following rows.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntraChroma_8x8, r11

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case


        ;// Load arguments from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg

        MOV      y, #BLK_SIZE                        ;// Legacy loop count: r12 is rewritten before any read below
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case selected by predMode

;//--------------------------------------------
;// DC prediction
;//--------------------------------------------
OMX_VC_CHROMA_DC
        AND      availability, availability,#(OMX_VC_UPPER + OMX_VC_LEFT)
        CMP      availability, #(OMX_VC_UPPER + OMX_VC_LEFT) ;// both UPPER and LEFT available?
        LDR      r0x01010101, =MUL_CONST0            ;// byte replicator, shared by all DC sub-cases
        BNE      TST_UPPER                           ;// Jump to Upper if not both
        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep (overwrites pSrcAbove, already loaded)

        ;// M_STALL ARM1136JS=1

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3]) in low halfword
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7]) in low halfword
        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ;// Even rows are read through pSrcLeft, odd rows through pSrcLeft2
        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (leftStep no longer needed)
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
        ADD      tVal2, tVal8, tVal9                 ;// tVal2 = pSrcLeft[0] + pSrcLeft[1]

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        ADD      tVal14, tVal4, tVal12               ;// tVal14 = pSrcLeft[2] + pSrcLeft[3] (leftStepx2 is dead from here)

        LDRB     tVal4, [pSrcLeft]                   ;// tVal4 = pSrcLeft[6]
        LDRB     tVal12,[pSrcLeft2]                  ;// tVal12= pSrcLeft[7]
        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = pSrcLeft[4] + pSrcLeft[5]
        ADD      tVal2, tVal2, tVal14                ;// leftsum1  = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = pSrcLeft[6] + pSrcLeft[7]
        ADD      tVal14, tVal8, tVal4                ;// leftsum2  = sum(pSrcLeft[4] to pSrcLeft[7])
        ADD      tVal8, tVal14, #2                   ;// tVal8 = leftsum2 + 2
        ADD      tVal9, sum2,   #2                   ;// tVal9 = upsum2 + 2
        ADD      sum1,  sum1, tVal2                  ;// sum1 = upsum1 + leftsum1
        ADD      sum2,  sum2, tVal14                 ;// sum2 = upsum2 + leftsum2
        ADD      sum1, sum1, #4                      ;// (sum1 + 4)
        ADD      sum2, sum2, #4                      ;// (sum2 + 4)
        MOV      sum1,  sum1,  LSR #3                ;// top-left DC     = (upsum1 + leftsum1 + 4)>>3
        MOV      tVal9, tVal9, LSR #2                ;// top-right DC    = (upsum2 + 2)>>2
        MOV      tVal8, tVal8, LSR #2                ;// bottom-left DC  = (leftsum2 + 2)>>2
        MOV      sum2,  sum2,  LSR #3                ;// bottom-right DC = (upsum2 + leftsum2 + 4)>>3

        MUL      tVal0, sum1, r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal1, tVal9,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal8, tVal8,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal9, sum2, r0x01010101            ;// replicate the val in all the bytes

        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 0 = {top-left, top-right}
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 2
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// row 3

        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 4 = {bottom-left, bottom-right}
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 5
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 6
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

TST_UPPER

        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_UPPER         ;// only UPPER available? (availability is already masked)

        BNE      TST_LEFT                            ;// Jump to Left if not upper
        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ;// M_STALL ARM1136JS=3

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1,  tVal7, tVal8                 ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2,  tVal7, tVal9                 ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]

        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3]) in low halfword
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7]) in low halfword

        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      sum1, sum1,r0x01010101              ;// replicate the val in all the bytes
        MUL      sum2, sum2,r0x01010101              ;// replicate the val in all the bytes

        ;// sum1/sum2 are r6/r7, i.e. the tVal6/tVal7 pair stored below
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = {left-half DC, right-half DC}
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

TST_LEFT
        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_LEFT          ;// only LEFT available?
        BNE      TST_COUNT0                          ;// neither neighbour available
        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (leftStep no longer needed)
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]

        ADD      tVal6, tVal8, tVal9                 ;// tVal6 = pSrcLeft[0] + pSrcLeft[1]

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        ADD      tVal7, tVal4, tVal12                ;// tVal7 = pSrcLeft[2] + pSrcLeft[3]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[6]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[7]

        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = pSrcLeft[4] + pSrcLeft[5]
        ADD      sum1,  tVal6, tVal7                 ;// sum1  = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = pSrcLeft[6] + pSrcLeft[7]
        ADD      sum2,  tVal8, tVal4                 ;// sum2  = sum(pSrcLeft[4] to pSrcLeft[7])

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      tVal6, sum1,r0x01010101             ;// replicate the val in all the bytes
        MUL      tVal8, sum2,r0x01010101             ;// replicate the val in all the bytes

        ;// M_STALL ARM1136JS=1
        MOV      tVal7,tVal6                         ;// tVal7 = replicated upper-half DC
        MOV      tVal9,tVal8                         ;// tVal9 = replicated lower-half DC

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = upper-half DC
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3

        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 4 = lower-half DC
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 5
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 6
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// row 7

        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway-break frm case

TST_COUNT0
        LDR      sum1, =MUL_CONST1                   ;// sum1 (= tVal6) = 0x80808080, no neighbour available

        ;// M_STALL ARM1136JS=2

        MOV      tVal7, sum1                         ;// tVal7 = sum1

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = 0x80 in every byte
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7

        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway-break frm case

;//--------------------------------------------
;// Horizontal prediction
;//--------------------------------------------
OMX_VC_CHROMA_HOR

        ;// M_STALL ARM1136JS=2

        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
        ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = leftStep * 2
        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
        SUB      dstStepx2, dstStepx2, #4            ;// double dstStep minus 4 (the first STR of a row already advanced by 4)
        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[0]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[1]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[2]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[3]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 0, bytes 0 to 3
        STR      tVal7, [pDst2], #+4                 ;// row 1, bytes 0 to 3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 0, bytes 4 to 7; pDst  -> row 2
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 1, bytes 4 to 7; pDst2 -> row 3
        STR      tVal8, [pDst],  #+4                 ;// row 2, bytes 0 to 3
        STR      tVal9, [pDst2], #+4                 ;// row 3, bytes 0 to 3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 2, bytes 4 to 7; pDst  -> row 4
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 3, bytes 4 to 7; pDst2 -> row 5
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[4]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[5]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[6]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[7]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 4, bytes 0 to 3
        STR      tVal7, [pDst2], #+4                 ;// row 5, bytes 0 to 3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 4, bytes 4 to 7; pDst  -> row 6
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 5, bytes 4 to 7; pDst2 -> row 7
        STR      tVal8, [pDst],  #+4                 ;// row 6, bytes 0 to 3
        STR      tVal9, [pDst2], #+4                 ;// row 7, bytes 0 to 3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 6, bytes 4 to 7
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 7, bytes 4 to 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// Vertical prediction
;//--------------------------------------------
OMX_VC_CHROMA_VERT

        ;// M_STALL ARM1136JS=4

        LDMIA    pSrcAbove, {tVal6,tVal7}            ;// tVal 6 to 7 = pSrcAbove[0 to 7]
        MOV      return, #OMX_Sts_NoErr

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 0 = pSrcAbove[0 to 7]
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 1
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 2
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 3
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 4
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 5
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 6
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// row 7

        M_EXIT                                       ;// Macro to exit midway-break frm case

;//--------------------------------------------
;// Plane prediction
;//   H = 4*(A[7]-AL) + 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + (A[4]-A[2])
;//   V = 4*(L[7]-AL) + 3*(L[6]-L[0]) + 2*(L[5]-L[1]) + (L[4]-L[2])
;//   a = 16*(A[7] + L[7]),  b = (17*H + 16)>>5,  c = (17*V + 16)>>5
;//   (A = pSrcAbove, L = pSrcLeft rows, AL = pSrcAboveLeft[0])
;//--------------------------------------------
OMX_VC_CHROMA_PLANE

        ;// M_STALL ARM1136JS=3

        RSB      tVal14, leftStep, leftStep, LSL #3  ;// 7*leftStep
        LDRB     tVal7, [pSrcAbove, #+7]             ;// pSrcAbove[7]
        LDRB     tVal6, [pSrcLeft, +tVal14]          ;// pSrcLeft[7*leftStep]
        LDRB     tVal8, [pSrcAboveLeft]              ;// pSrcAboveLeft[0]
        LDRB     tVal9, [pSrcAbove, #+6 ]            ;// pSrcAbove[6]
        LDRB     tVal10,[pSrcAbove]                  ;// pSrcAbove[0]
        ADD      tVal2, tVal7, tVal6                 ;// pSrcAbove[7] + pSrcLeft[7*leftStep]
        SUB      tVal6, tVal6, tVal8                 ;// V0 = pSrcLeft[7*leftStep] - pSrcAboveLeft[0]
        SUB      tVal7, tVal7, tVal8                 ;// H0 = pSrcAbove[7] - pSrcAboveLeft[0]
        LSL      tVal2, tVal2, #4                    ;// a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
        ADD      tVal2, tVal2, #16                   ;// a + 16
        SUB      tVal9, tVal9,tVal10                 ;// pSrcAbove[6] - pSrcAbove[0]
        LDRB     tVal8, [pSrcAbove,#+5]              ;// pSrcAbove[5]
        LDRB     tVal10,[pSrcAbove,#+1]              ;// pSrcAbove[1]
        ADD      tVal9, tVal9, tVal9, LSL #1         ;// H1 = 3 * (pSrcAbove[6] - pSrcAbove[0])
        ADD      tVal7, tVal9, tVal7, LSL #2         ;// H = H1 + 4*H0
        SUB      tVal8, tVal8, tVal10                ;// pSrcAbove[5] - pSrcAbove[1]
        LDRB     tVal9, [pSrcAbove,#+4]              ;// pSrcAbove[4]
        LDRB     tVal10,[pSrcAbove,#+2]              ;// pSrcAbove[2]
        ADD      tVal7, tVal7, tVal8, LSL #1         ;// H = H + 2*(pSrcAbove[5] - pSrcAbove[1])
        SUB      tVal11, tVal14,leftStep             ;// 6*leftStep
        ADD      tVal11, pSrcLeft, tVal11            ;// pSrcLeft + 6*leftStep (walks upwards)
        MOV      tVal12, pSrcLeft                    ;// pSrcLeft (walks downwards)
        SUB      tVal9, tVal9, tVal10                ;// pSrcAbove[4] - pSrcAbove[2]
        ADD      tVal7, tVal7, tVal9                 ;// H = H + (pSrcAbove[4] - pSrcAbove[2])
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[6*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[0]
        ADD      tVal7, tVal7, tVal7, LSL #4         ;// 17 * H
        ADD      tVal7, tVal7, #16                   ;// 17 * H + 16
        SUB      tVal8, tVal8, tVal10                ;// pSrcLeft[6*leftStep] - pSrcLeft[0]
        ASR      b, tVal7, #5                        ;// b = (17 * H + 16) >> 5
        ADD      tVal8, tVal8, tVal8, LSL #1         ;// V1 = 3 * (pSrcLeft[6*leftStep] - pSrcLeft[0])
        ADD      tVal6, tVal8, tVal6, LSL #2         ;// V = V1 + 4*V0
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[5*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[leftStep]
        ADD      tVal7, b, b, LSL #1                 ;// 3*b
        SUB      tVal2, tVal2, tVal7                 ;// a + 16 - 3*b
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[5*leftStep] - pSrcLeft[leftStep]
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[4*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7, LSL #1         ;// V = V + 2*(pSrcLeft[5*leftStep] - pSrcLeft[leftStep])
        LDR      r0x00FF00FF, =MASK_CONST            ;// r0x00FF00FF = 0x00FF00FF (tVal11 pointer is dead now)
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7                 ;// V = V + (pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep])
        SUB      dstStep, dstStep, #4                ;// dstStep - 4 (the first STR of each row advances pDst by 4)
        ADD      tVal6, tVal6, tVal6, LSL #4         ;// 17*V
        ADD      tVal6, tVal6, #16                   ;// 17*V + 16

        ;// M_STALL ARM1136JS=1

        ASR      c, tVal6, #5                        ;// c = (17*V + 16)>>5

        ;// M_STALL ARM1136JS=1

        ADD      tVal6, c, c, LSL #1                 ;// 3*c
        UXTH     c, c                                ;// only in half word
        SUB      tVal6, tVal2, tVal6                 ;// p0 = a - 3*b - 3*c + 16 (may be negative)
        ORR      c, c, c, LSL #16                    ;// {c,c}
        ADD      tVal7, b, b                         ;// 2b
        ADD      tVal2, tVal6, tVal7                 ;// p2 = p0 + 2*b
        ADD      tVal7, tVal7, b                     ;// 3b
        ;// PKHBT keeps only bits [15:0] of p0, so a negative p0 cannot
        ;// spill its sign-extension bits into the p2 halfword.
        PKHBT    p2p0,   tVal6,  tVal2,  LSL #16     ;// p2p0   = pack {p2, p0}
        UXTH     b, b
        UXTH     tVal7, tVal7
        ORR      b, b, b, LSL #16                    ;// {b,b}
        ORR      tVal7, tVal7, tVal7, LSL #16        ;// {3b,3b}
        SADD16   p3p1,   p2p0, b                     ;// p3p1   = p2p0 + {b,b}
        SADD16   p6p4,   p3p1, tVal7                 ;// p6p4   = p3p1 + {3b,3b}
        SADD16   p7p5,   p6p4, b                     ;// p7p5   = p6p4 + {b,b}
        MOV      outerCount, #BLK_SIZE               ;// Outer Loop Count (b is dead now)

        ;// Each iteration emits one row. pNpM hold the unclipped 16-bit
        ;// accumulators for that row and are advanced by {c,c}; the
        ;// clipped copies live only in the ppNppM scratch registers.
LOOP_PLANE

        USAT16   pp7pp5, #13, p7p5                    ;// clip13(p7) clip13(p5)
        USAT16   pp6pp4, #13, p6p4                    ;// clip13(p6) clip13(p4)
        USAT16   pp3pp1, #13, p3p1                    ;// clip13(p3) clip13(p1)
        USAT16   pp2pp0, #13, p2p0                    ;// clip13(p2) clip13(p0)

        AND      pp7pp5, r0x00FF00FF, pp7pp5, ASR #5  ;// clip8(p7) clip8(p5)
        AND      pp6pp4, r0x00FF00FF, pp6pp4, ASR #5  ;// clip8(p6) clip8(p4)
        AND      pp3pp1, r0x00FF00FF, pp3pp1, ASR #5  ;// clip8(p3) clip8(p1)
        AND      pp2pp0, r0x00FF00FF, pp2pp0, ASR #5  ;// clip8(p2) clip8(p0)

        SUBS     outerCount, outerCount, #1           ;// outerCount-- (Z flag is consumed by BNE below)

        ORR      p3210, pp2pp0, pp3pp1, LSL #8        ;// pack {p3,p2, p1, p0}
        STR      p3210, [pDst], #4                    ;// store {pDst[0] to pDst[3]}

        ORR      p7654, pp6pp4, pp7pp5, LSL #8        ;// pack {p7,p6, p5, p4}
        M_STR    p7654, [pDst], dstStep               ;// store {pDst[4] to pDst[7]}, advance to next row

        SADD16   p7p5,   p7p5, c                      ;// {p7 + c}, {p5 + c}
        SADD16   p6p4,   p6p4, c                      ;// {p6 + c}, {p4 + c}
        SADD16   p3p1,   p3p1, c                      ;// {p3 + c}, {p1 + c}
        SADD16   p2p0,   p2p0, c                      ;// {p2 + c}, {p0 + c}

        BNE      LOOP_PLANE                           ;// Loop for 8 times
        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// ARM1136JS



        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------