;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Works in place on sixteen 16-bit values at pData:
;//   1. add/subtract butterfly across the four D registers
;//   2. 4x4 transpose
;//   3. the same butterfly again
;//   4. out = (value * (pQPModTable[QP] << pQPDivTable[QP]) + 2) >> 2,
;//      narrowed back to 16 bits
;//
;// In:   r0 = pData  (block is read from and written back to this address)
;//       r1 = QP     (used as a byte index into both look-up tables)
;// Out:  nothing is placed in r0 by this code
;//----------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF CortexA8

;// Argument registers
pData       RN 0
QP          RN 1

;// ARM scratch registers
pDivTab     RN 2                        ;// base of armVCM4P10_QPDivTable
pModTab     RN 3                        ;// base of armVCM4P10_VMatrixQPModTable
qpShift     RN 4
qpScale     RN 5

;// NEON registers
;// D0-D3 carry the block through every stage: loaded input, butterfly
;// results, transposed rows and the final dequantized output.
dBlk0       DN D0.S16
dBlk1       DN D1.S16
dBlk2       DN D2.S16
dBlk3       DN D3.S16

;// The same four registers seen as two Q registers for the 32-bit
;// step of the transpose.
qBlk01      QN Q0.32
qBlk23      QN Q1.32

;// Butterfly temporaries, shared by both passes
dSum01      DN D4.S16
dSum23      DN D5.S16
dDiff01     DN D6.S16
dDiff23     DN D7.S16

;// Dequantization temporaries. D5 and Q3 (D6:D7) overlap the butterfly
;// temporaries; they are only written after the second pass has
;// finished reading them.
dScaleVec   DN D5.S16
qAcc0       QN Q3.S32
qAcc1       QN Q4.S32
qAcc2       QN Q5.S32
qAcc3       QN Q6.S32


        ;// Allocate stack memory required by the function
        ;// (none)

        ;// Write function header
        ;// NOTE(review): r5 and d13 are presumably the highest ARM / NEON
        ;// registers M_START is asked to preserve (Q6 occupies D12:D13) -
        ;// confirm against the macro definition in armCOMM_s.h.
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;//-------------------------------------------------------------
        ;// Strategy:
        ;//   - load the 4x4 block into four D registers, transposed
        ;//   - first butterfly pass, four lanes at a time
        ;//   - transpose the 4x4 result
        ;//   - second butterfly pass
        ;//   - scale, round, narrow and store
        ;//-------------------------------------------------------------

        ;// VLD4 de-interleaves, so the block arrives already transposed
        VLD4    {dBlk0,dBlk1,dBlk2,dBlk3},[pData]
        LDR     pDivTab, =armVCM4P10_QPDivTable
        LDR     pModTab, =armVCM4P10_VMatrixQPModTable

        ;//-------------------------------------------------------------
        ;// First pass.
        ;// The ARM instructions that build the scale factor are placed
        ;// between the NEON instructions so that the two streams can
        ;// dual issue; keep this ordering.
        ;//-------------------------------------------------------------

        VADD    dSum01,dBlk0,dBlk1
        VADD    dSum23,dBlk2,dBlk3
        VSUB    dDiff01,dBlk0,dBlk1
        LDRSB   qpShift, [pDivTab, QP]          ;// ARM: qpShift = pDivTab[QP]
        VSUB    dDiff23,dBlk2,dBlk3
        LDRSB   qpScale, [pModTab, QP]          ;// ARM: qpScale = pModTab[QP]
        VADD    dBlk0,dSum01,dSum23
        VSUB    dBlk1,dSum01,dSum23
        VSUB    dBlk2,dDiff01,dDiff23
        LSL     qpScale, qpScale, qpShift       ;// ARM: qpScale <<= qpShift
        VADD    dBlk3,dDiff01,dDiff23

        ;//-------------------------------------------------------------
        ;// Transpose: swap 16-bit lanes within each register pair, then
        ;// 32-bit lanes between the two Q halves.
        ;//-------------------------------------------------------------

        VTRN    dBlk0,dBlk1
        VTRN    dBlk2,dBlk3
        VTRN    qBlk01,qBlk23

        ;//-------------------------------------------------------------
        ;// Second pass
        ;//-------------------------------------------------------------

        VADD    dSum01,dBlk0,dBlk1
        VADD    dSum23,dBlk2,dBlk3
        VSUB    dDiff01,dBlk0,dBlk1
        VSUB    dDiff23,dBlk2,dBlk3
        VADD    dBlk0,dSum01,dSum23
        VSUB    dBlk1,dSum01,dSum23
        VSUB    dBlk2,dDiff01,dDiff23
        VADD    dBlk3,dDiff01,dDiff23

        ;//-------------------------------------------------------------
        ;// Dequantize.
        ;// Differs from the C reference: rather than subtracting 2 from
        ;// the shift and branching on its sign, the scale was shifted
        ;// left once (LSL above) and every product is shifted right by
        ;// the constant 2 with a rounding term of 2. This removes the
        ;// branches and shortens the code.
        ;//-------------------------------------------------------------

        VDUP    dScaleVec, qpScale              ;// broadcast the scale to all lanes

        VMOV    qAcc0,#2                        ;// preload the rounding term
        VMOV    qAcc1,#2
        VMOV    qAcc2,#2
        VMOV    qAcc3,#2

        VMLAL   qAcc0,dBlk0,dScaleVec           ;// acc = value * scale + 2
        VMLAL   qAcc1,dBlk1,dScaleVec
        VMLAL   qAcc2,dBlk2,dScaleVec
        VMLAL   qAcc3,dBlk3,dScaleVec

        VSHRN   dBlk0,qAcc0,#2                  ;// >> 2 and narrow to 16 bits
        VSHRN   dBlk1,qAcc1,#2
        VSHRN   dBlk2,qAcc2,#2
        VSHRN   dBlk3,qAcc3,#2

        ;//-------------------------------------------------------------
        ;// Write the sixteen results back over the input
        ;//-------------------------------------------------------------

        VST1    {dBlk0,dBlk1,dBlk2,dBlk3}, [pData]


        ;// Set return value
        ;// (none)

        ;// Write function tail
        M_END

    ENDIF                                       ;//CortexA8



;// Function: omxVCM4P10_TransformDequantLumaDCFromPair

;//Input Registers
ppSrc       RN 0
pDst        RN 1
QPR2        RN 2

;//Output Registers
result      RN 0

;//Local Scratch Registers
pDstR4      RN 4
pDstR0      RN 0
QPR1        RN 1                        ;// QP in the register the second callee reads it from
QPR5        RN 5                        ;// QP parked here across the first call

;//----------------------------------------------------------------------
;// omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// In:   r0 = ppSrc, r1 = pDst, r2 = QP
;// Out:  r0 = OMX_Sts_NoErr (unconditionally; no argument checks are
;//       made in this routine)
;//
;// Two calls in sequence:
;//   armVCM4P10_UnpackBlock4x4               with r0, r1 as received
;//   armVCM4P10_InvTransformDequantLumaDC4x4 with r0 = pDst, r1 = QP
;//----------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF CortexA8

        ;// Allocate stack memory required by the function
        ;// (none)

        ;// Write function header
        ;// NOTE(review): "r5" presumably asks M_START to preserve the ARM
        ;// registers up to r5 - confirm against armCOMM_s.h.
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// Copy QP and pDst into r5 / r4 before the first call so they
        ;// are still available afterwards.
        MOV     QPR5,QPR2                       ;// r5 = QP
        MOV     pDstR4,pDst                     ;// r4 = pDst
        ;// NOTE(review): presumably expands the data referenced through
        ;// ppSrc into the 4x4 block at pDst - the routine is defined
        ;// elsewhere, verify there.
        BL      armVCM4P10_UnpackBlock4x4

        ;// Rebuild the argument registers for the second call.
        MOV     QPR1,QPR5                       ;// r1 = QP
        MOV     pDstR0,pDstR4                   ;// r0 = pDst
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


    ENDIF                                       ;//CortexA8


        END