;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// External symbols: the coefficient unpacker called by the public entry
;// point and the two byte tables indexed by QP in the static helper.

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS CortexA8

;// Debug level (left disabled)
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In:   r0 = pointer to a 4x4 block of 16-bit values; the block is
;//            read from and written back to this same address
;//       r1 = QP, used as a byte index into armVCM4P10_QPDivTable and
;//            armVCM4P10_VMatrixQPModTable
;// Out:  the 16 values at [r0] are replaced by the transformed and
;//       dequantised results
;// Uses: r2-r5 and D0-D13. "r5,d13" is handed to M_START so the macro
;//       can preserve what the procedure call standard requires
;//       (presumably r4-r5 and d8-d13 -- see armCOMM_s.h to confirm).
;//----------------------------------------------------------------------

;// Only built for the Cortex-A8 variant

        IF CortexA8

;// Arguments
pBlk            RN 0
qpIdx           RN 1

;// Core-register scratch
pDivTab         RN 2
pModTab         RN 3
shiftAmt        RN 4
scaleVal        RN 5

;// NEON aliases. The same physical registers are reused from stage to
;// stage, so several names below intentionally map to one D/Q register.

;// Stage 0: block as loaded
dSrc0           DN D0.S16
dSrc1           DN D1.S16
dSrc2           DN D2.S16
dSrc3           DN D3.S16

;// Stage 1: first-pass partial sums and differences
dRowAdd01       DN D4.S16
dRowAdd23       DN D5.S16
dRowSub01       DN D6.S16
dRowSub23       DN D7.S16

;// Stage 1: first-pass results (also viewed as two Q registers for the
;// 32-bit step of the transpose)
dRowRes0        DN D0.S16
dRowRes1        DN D1.S16
dRowRes2        DN D2.S16
dRowRes3        DN D3.S16
qRowRes01       QN Q0.32
qRowRes23       QN Q1.32

;// Stage 2: second-pass partial sums and differences
dColAdd01       DN D4.S16
dColAdd23       DN D5.S16
dColSub01       DN D6.S16
dColSub23       DN D7.S16

;// Stage 2: second-pass results
dColRes0        DN D0.S16
dColRes1        DN D1.S16
dColRes2        DN D2.S16
dColRes3        DN D3.S16

;// Stage 3: dequantisation temporaries
;// (dScaleVec shares D5 with dColAdd23, qAcc0 shares D6/D7 with the
;// second-pass differences; all of those are dead by the time the
;// dequantisation stage writes them)
dScaleVec       DN D5.S16
qAcc0           QN Q3.S32
qAcc1           QN Q4.S32
qAcc2           QN Q5.S32
qAcc3           QN Q6.S32

;// Final 16-bit results
dDst0           DN D0.S16
dDst1           DN D1.S16
dDst2           DN D2.S16
dDst3           DN D3.S16


        ;// No stack workspace is allocated by this function

        ;// Function prologue
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;//--------------------------------------------------------------
        ;// Outline
        ;//   1. Load the 4x4 block into four D registers, transposed
        ;//   2. Row operations (applied to the transposed columns, SIMD)
        ;//   3. Transpose the 4x4 intermediate result
        ;//   4. Column operations
        ;//   5. Dequantise and store
        ;//--------------------------------------------------------------

        ;// VLD4 de-interleaves the 16-bit elements across the four D
        ;// registers, so the block arrives already transposed.

        VLD4    {dSrc0,dSrc1,dSrc2,dSrc3},[pBlk]
        LDR     pDivTab, =armVCM4P10_QPDivTable             ;// base of the QP division table
        LDR     pModTab, =armVCM4P10_VMatrixQPModTable      ;// base of the QP modulo table

        ;//--------------------------------------------------------------
        ;// Row operations (carried out on the transposed columns)
        ;//
        ;// The scalar scale-factor computation is threaded between the
        ;// NEON instructions so that the two streams can dual-issue.
        ;//--------------------------------------------------------------

        VADD    dRowAdd01,dSrc0,dSrc1
        VADD    dRowAdd23,dSrc2,dSrc3
        VSUB    dRowSub01,dSrc0,dSrc1
        LDRSB   shiftAmt, [pDivTab, qpIdx]                  ;// ARM: shiftAmt = QPDivTable[QP] (signed byte)
        VSUB    dRowSub23,dSrc2,dSrc3
        LDRSB   scaleVal, [pModTab, qpIdx]                  ;// ARM: scaleVal = VMatrixQPModTable[QP] (signed byte)
        VADD    dRowRes0,dRowAdd01,dRowAdd23
        VSUB    dRowRes1,dRowAdd01,dRowAdd23
        VSUB    dRowRes2,dRowSub01,dRowSub23
        LSL     scaleVal, scaleVal, shiftAmt                ;// ARM: scaleVal <<= shiftAmt
        VADD    dRowRes3,dRowSub01,dRowSub23

        ;//--------------------------------------------------------------
        ;// Transpose the intermediate 4x4 matrix: 16-bit element swaps
        ;// within each D pair, then one 32-bit swap across the Q pair.
        ;//--------------------------------------------------------------

        VTRN    dRowRes0,dRowRes1
        VTRN    dRowRes2,dRowRes3
        VTRN    qRowRes01,qRowRes23

        ;//--------------------------------------------------------------
        ;// Column operations (same add/subtract pattern as above)
        ;//--------------------------------------------------------------

        VADD    dColAdd01,dRowRes0,dRowRes1
        VADD    dColAdd23,dRowRes2,dRowRes3
        VSUB    dColSub01,dRowRes0,dRowRes1
        VSUB    dColSub23,dRowRes2,dRowRes3
        VADD    dColRes0,dColAdd01,dColAdd23
        VSUB    dColRes1,dColAdd01,dColAdd23
        VSUB    dColRes2,dColSub01,dColSub23
        VADD    dColRes3,dColSub01,dColSub23

        ;//--------------------------------------------------------------
        ;// Dequantisation
        ;//
        ;// This differs from the C reference code in form only. The
        ;// reference subtracts 2 from Shift and then has to treat the
        ;// Shift >= 0 and Shift < 0 cases separately. Here Scale was
        ;// shifted left by Shift once, up front, and every product is
        ;// shifted right by a constant 2 afterwards, with a rounding
        ;// constant of 2. A single straight-line sequence therefore
        ;// handles both cases, which removes the branches and keeps
        ;// the code small.
        ;//--------------------------------------------------------------

        VDUP    dScaleVec, scaleVal                         ;// broadcast the ARM-side scale into every 16-bit lane

        VMOV    qAcc0,#2                                    ;// preload each accumulator with the rounding constant
        VMOV    qAcc1,#2
        VMOV    qAcc2,#2
        VMOV    qAcc3,#2

        VMLAL   qAcc0,dColRes0,dScaleVec                    ;// acc = value * scale + round (widened to 32 bits)
        VMLAL   qAcc1,dColRes1,dScaleVec
        VMLAL   qAcc2,dColRes2,dScaleVec
        VMLAL   qAcc3,dColRes3,dScaleVec

        VSHRN   dDst0,qAcc0,#2                              ;// shift right by 2 and narrow back to 16 bits
        VSHRN   dDst1,qAcc1,#2
        VSHRN   dDst2,qAcc2,#2
        VSHRN   dDst3,qAcc3,#2

        ;//--------------------------------------------------------------
        ;// Write the 16 results back over the input block
        ;//--------------------------------------------------------------

        VST1    {dDst0,dDst1,dDst2,dDst3}, [pBlk]


        ;// No return value is set by this helper

        ;// Function epilogue
        M_END

        ENDIF                                               ;// CortexA8



;//----------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// In:   r0 = ppSrc, handed unchanged to armVCM4P10_UnpackBlock4x4
;//       r1 = pDst,  handed unchanged to armVCM4P10_UnpackBlock4x4 and
;//            afterwards used as the block pointer for the in-place
;//            transform/dequantise helper
;//       r2 = QP
;// Out:  r0 = OMX_Sts_NoErr (always)
;//----------------------------------------------------------------------

;// Arguments
ppStream        RN 0
pCoef           RN 1
qpArg           RN 2

;// Return value
retCode         RN 0

;// Scratch: copies that must survive the first call, and the argument
;// registers for the second call
pCoefSaved      RN 4
pCoefArg0       RN 0
qpArg1          RN 1
qpSaved         RN 5

;// Only built for the Cortex-A8 variant

        IF CortexA8

        ;// No stack workspace is allocated by this function

        ;// Function prologue
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// r1 and r2 are needed again after the call below, so park
        ;// them in r4/r5 first.
        MOV     pCoefSaved,pCoef                            ;// keep r1 (destination block pointer)
        MOV     qpSaved,qpArg                               ;// keep r2 (QP)
        BL      armVCM4P10_UnpackBlock4x4

        ;// Re-marshal the saved values into the helper's argument registers
        MOV     pCoefArg0,pCoefSaved                        ;// r0 = destination block pointer
        MOV     qpArg1,qpSaved                              ;// r1 = QP
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Report success
        MOV     retCode,#OMX_Sts_NoErr

        ;// Function epilogue
        M_END


        ENDIF                                               ;// CortexA8


        END