;//
;//
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;// Import symbols required from other files
;// (For example tables)
;// -- none needed by this routine --


;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name

    IF ARM1136JS

;//--------------------------------------------------------------------
;// Register map
;//
;// Only r1-r12 and r14 are used as data registers, so the alias groups
;// below deliberately overlap: each group names the same physical
;// registers at a different stage of the computation.  An alias is only
;// meaningful between the instruction that writes it and the last
;// instruction that reads it, so the instruction order in the function
;// body must not be changed without re-checking every overlap.
;//
;// Packed registers hold two 16-bit lanes and are described as [hi:lo].
;//--------------------------------------------------------------------

;// Arguments
pDst        RN 0                ;// destination block (written by the final STMIA)
pSrc        RN 1                ;// source block (read by the initial LDMIA)

;// Stage 0: the eight words loaded from pSrc, two elements per word
in00        RN 2                ;// Src[0]  & Src[1]
in02        RN 3                ;// Src[2]  & Src[3]
in10        RN 4                ;// Src[4]  & Src[5]
in12        RN 5                ;// Src[6]  & Src[7]
in20        RN 6                ;// Src[8]  & Src[9]
in22        RN 7                ;// Src[10] & Src[11]
in30        RN 8                ;// Src[12] & Src[13]
in32        RN 9                ;// Src[14] & Src[15]

;// Stage 1: transposed input (rows turned into columns)
trRow00     RN 2
trRow10     RN 10
trRow02     RN 3
trRow12     RN 5
trRow20     RN 11
trRow30     RN 12
trRow32     RN 14
trRow22     RN 7

;// Stage 2: scratch for the row pass
e0          RN 4
e1          RN 6
e2          RN 8
e3          RN 9
zeroLanes   RN 1                ;// holds 0; reuses pSrc's register once the load is done

;// Stage 2: results of the row pass
rowOp00     RN 2
rowOp10     RN 10
rowOp20     RN 11
rowOp30     RN 12
rowOp02     RN 3
rowOp12     RN 5
rowOp22     RN 7
rowOp32     RN 14

;// Stage 3: row-pass results transposed back for the column pass
trCol00     RN 2
trCol02     RN 3
trCol10     RN 4
trCol12     RN 5
trCol20     RN 6
trCol22     RN 7
trCol30     RN 8
trCol32     RN 9

;// Stage 4: scratch for the column pass
g0          RN 10
g1          RN 11
g2          RN 12
g3          RN 14

;// Stage 4: results of the column pass
colOp00     RN 2
colOp02     RN 3
colOp10     RN 4
colOp12     RN 5
colOp20     RN 6
colOp22     RN 7
colOp30     RN 8
colOp32     RN 9

;// Stage 5: rounding constants
temp1       RN 10               ;// temporary scratch (not referenced by the code below)
roundBias   RN 11               ;// 0x00208020, see the rounding stage
signFix     RN 12               ;// 0x200, see the rounding stage
laneMask    RN 14               ;// 0xffff03ff, see the rounding stage

;// Stage 5: final values, in the order they are stored
out00       RN 2
out02       RN 3
out10       RN 4
out12       RN 5
out20       RN 6
out22       RN 7
out30       RN 8
out32       RN 9


;// Allocate stack memory required by the function
;// -- none --


;//--------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Reads a 4x4 block of 16-bit values from pSrc, applies the same
;// four-point butterfly to every row and then to every column, rounds
;// each result with (x + 32) >> 6 and writes the 4x4 block to pDst.
;//
;// Butterfly applied to four values d0..d3:
;//      e0 = d0 + d2            f0 = e0 + e3
;//      e1 = d0 - d2            f1 = e1 + e2
;//      e2 = (d1 >> 1) - d3     f2 = e1 - e2
;//      e3 = d1 + (d3 >> 1)     f3 = e0 - e3
;//
;// In:   r0 = pDst, r1 = pSrc
;// Out:  16 halfwords stored at pDst; no return value is set here.
;//
;// Notes for maintainers:
;//  - Both blocks are accessed as eight whole words with LDMIA/STMIA,
;//    so pSrc and pDst are presumably required to be word aligned.
;//    TODO confirm against the callers.
;//  - r14 is used as a data register, so this relies on M_START/M_END
;//    (defined in armCOMM_s.h, not visible here) preserving the return
;//    address as well as r4-r11.  TODO confirm.
;//  - Element numbers in the comments (S0..S15) follow the original
;//    author's numbering, which takes the low halfword of each loaded
;//    word as the even-numbered element (little-endian layout assumed).
;//--------------------------------------------------------------------

        ;// Write function header
        M_START armVCM4P10_TransformResidual4x4,r11

        ;//------------------------------------------------------------
        ;// Strategy
        ;//   1. Load the 4x4 block into 8 registers
        ;//   2. Transpose it so one row transform runs in each SIMD lane
        ;//   3. Row pass (two rows per instruction)
        ;//   4. Transpose the result
        ;//   5. Column pass (two columns per instruction)
        ;//   6. Round, then store the whole block with one STMIA
        ;//------------------------------------------------------------

        ;// 1. Load all sixteen elements
        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;// pSrc is dead from here on; its register now supplies the zero
        ;// operand that turns SHADD16 into a per-lane ">> 1".
        MOV     zeroLanes,#0

        ;//------------------------------------------------------------
        ;// 2. Transpose in[][] into trRow[][], one 2x2 quadrant at a time.
        ;//    PKHBT pairs up the two low lanes, PKHTB the two high lanes.
        ;//    Each pair below writes over one of its own inputs only in
        ;//    its last instruction.
        ;//------------------------------------------------------------

        ;// Rows 0 and 1, elements 0 and 1
        PKHTB   trRow10,in10,in00,ASR #16           ;// [S5  : S1 ]
        PKHBT   trRow00,in00,in10,LSL #16           ;// [S4  : S0 ]

        ;// Rows 0 and 1, elements 2 and 3
        PKHTB   trRow30,in12,in02,ASR #16           ;// [S7  : S3 ]
        PKHBT   trRow20,in02,in12,LSL #16           ;// [S6  : S2 ]

        ;// Rows 2 and 3, elements 0 and 1
        PKHBT   trRow02,in20,in30,LSL #16           ;// [S12 : S8 ]
        PKHTB   trRow12,in30,in20,ASR #16           ;// [S13 : S9 ]

        ;// Rows 2 and 3, elements 2 and 3
        PKHTB   trRow32,in32,in22,ASR #16           ;// [S15 : S11]
        PKHBT   trRow22,in22,in32,LSL #16           ;// [S14 : S10]

        ;//------------------------------------------------------------
        ;// 3. Row pass.  After the transpose trRow0x/1x/2x/3x hold
        ;//    d0/d1/d2/d3 of two different rows, one row per lane.
        ;//------------------------------------------------------------

        ;// Rows 0 (low lane) and 1 (high lane)
        SADD16  e0, trRow00,trRow20                 ;// e0 = d0 + d2
        SSUB16  e1, trRow00,trRow20                 ;// e1 = d0 - d2
        SHADD16 e2, trRow10,zeroLanes               ;// d1 >> 1
        SHADD16 e3, trRow30,zeroLanes               ;// d3 >> 1 (issued early; original note: avoids stalls on e2/e3)
        SSUB16  e2, e2, trRow30                     ;// e2 = (d1 >> 1) - d3
        SADD16  e3, e3, trRow10                     ;// e3 = d1 + (d3 >> 1)
        SADD16  rowOp00, e0, e3                     ;// f0 = e0 + e3
        SADD16  rowOp10, e1, e2                     ;// f1 = e1 + e2
        SSUB16  rowOp20, e1, e2                     ;// f2 = e1 - e2
        SSUB16  rowOp30, e0, e3                     ;// f3 = e0 - e3

        ;// Rows 2 (low lane) and 3 (high lane), same butterfly
        SADD16  e0, trRow02,trRow22
        SSUB16  e1, trRow02,trRow22
        SHADD16 e2, trRow12,zeroLanes
        SHADD16 e3, trRow32,zeroLanes
        SSUB16  e2, e2, trRow32
        SADD16  e3, e3, trRow12
        SADD16  rowOp02, e0, e3
        SADD16  rowOp12, e1, e2
        SSUB16  rowOp22, e1, e2
        SSUB16  rowOp32, e0, e3

        ;//------------------------------------------------------------
        ;// 4. Transpose rowOp[][] into trCol[][].  Same quadrant scheme
        ;//    as step 2.  Afterwards each trCol register holds two
        ;//    horizontally adjacent values of one row.
        ;//------------------------------------------------------------

        ;// f0 and f1 of rows 0 and 1
        PKHTB   trCol10,rowOp10,rowOp00,ASR #16     ;// row 1: [f1 : f0]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16     ;// row 0: [f1 : f0]

        ;// f0 and f1 of rows 2 and 3
        PKHTB   trCol30,rowOp12,rowOp02,ASR #16     ;// row 3: [f1 : f0]
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16     ;// row 2: [f1 : f0]

        ;// f2 and f3 of rows 0 and 1
        PKHBT   trCol02,rowOp20,rowOp30,LSL #16     ;// row 0: [f3 : f2]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16     ;// row 1: [f3 : f2]

        ;// f2 and f3 of rows 2 and 3
        PKHTB   trCol32,rowOp32,rowOp22,ASR #16     ;// row 3: [f3 : f2]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16     ;// row 2: [f3 : f2]

        ;//------------------------------------------------------------
        ;// 5. Column pass.  Same butterfly as step 3, now combining the
        ;//    four rows; each lane is one column.
        ;//------------------------------------------------------------

        ;// Columns 0 (low lane) and 1 (high lane)
        SADD16  g0, trCol00,trCol20                 ;// g0 = row0 + row2
        SSUB16  g1, trCol00,trCol20                 ;// g1 = row0 - row2
        SHADD16 g2, trCol10,zeroLanes               ;// row1 >> 1
        SHADD16 g3, trCol30,zeroLanes               ;// row3 >> 1
        SSUB16  g2, g2, trCol30                     ;// g2 = (row1 >> 1) - row3
        SADD16  g3, g3, trCol10                     ;// g3 = row1 + (row3 >> 1)
        SADD16  colOp00, g0, g3
        SADD16  colOp10, g1, g2
        SSUB16  colOp20, g1, g2
        SSUB16  colOp30, g0, g3

        ;// Columns 2 (low lane) and 3 (high lane)
        SADD16  g0, trCol02,trCol22
        SSUB16  g1, trCol02,trCol22
        SHADD16 g2, trCol12,zeroLanes
        SHADD16 g3, trCol32,zeroLanes
        SSUB16  g2, g2, trCol32
        SADD16  g3, g3, trCol12
        SADD16  colOp02, g0, g3
        SADD16  colOp12, g1, g2
        SSUB16  colOp22, g1, g2
        SSUB16  colOp32, g0, g3

        ;//------------------------------------------------------------
        ;// 6. Rounding: out = (colOp + 32) >> 6 in both lanes.
        ;//
        ;//    There is no per-lane shift, so one 32-bit ASR #6 is used.
        ;//    That is already correct for the high lane.  For the low
        ;//    lane:
        ;//      - roundBias adds 32 to both lanes and a further 32768 to
        ;//        the low lane, making the low lane an unsigned value;
        ;//      - after the shift, laneMask clears the six bits that
        ;//        dropped from the high lane into the top of the low lane;
        ;//      - signFix subtracts 512 (= 32768 >> 6) from the low lane
        ;//        only, which restores the signed result.
        ;//------------------------------------------------------------

        LDR     roundBias, =0x00208020              ;// high lane: +32, low lane: +32 +32768

        LDR     laneMask, =0xffff03ff               ;// keep high lane, keep low 10 bits of low lane

        MOV     signFix,#0x200                      ;// 2^9, applied to the low lane only

        ;// Row 0
        SADD16  colOp00, colOp00, roundBias
        SADD16  colOp02, colOp02, roundBias
        AND     colOp00, laneMask, colOp00, ASR #6
        AND     colOp02, laneMask, colOp02, ASR #6
        SSUB16  out00,colOp00,signFix
        SSUB16  out02,colOp02,signFix

        ;// Row 1
        SADD16  colOp10, colOp10, roundBias
        SADD16  colOp12, colOp12, roundBias
        AND     colOp10, laneMask, colOp10, ASR #6
        AND     colOp12, laneMask, colOp12, ASR #6
        SSUB16  out10,colOp10,signFix
        SSUB16  out12,colOp12,signFix

        ;// Row 2
        SADD16  colOp20, colOp20, roundBias
        SADD16  colOp22, colOp22, roundBias
        AND     colOp20, laneMask, colOp20, ASR #6
        AND     colOp22, laneMask, colOp22, ASR #6
        SSUB16  out20,colOp20,signFix
        SSUB16  out22,colOp22,signFix

        ;// Row 3
        SADD16  colOp30, colOp30, roundBias
        SADD16  colOp32, colOp32, roundBias
        AND     colOp30, laneMask, colOp30, ASR #6
        AND     colOp32, laneMask, colOp32, ASR #6
        SSUB16  out30,colOp30,signFix
        SSUB16  out32,colOp32,signFix

        ;// Store the whole 4x4 block in one go
        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}

        ;// Set return value
        ;// (nothing is written here)

End

        ;// Write function tail
        M_END

    ENDIF                                           ;//ARM1136JS


;// Guarding implementation by the processor name


    END