;//
;//
;// File Name: armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//


;// Standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

;// Symbols imported from other files (tables etc.): none are imported here


;// Debugging level (left disabled)
;//DEBUG_ON SETL {TRUE}


;// Everything below is assembled only for the CortexA8 variant

    IF CortexA8

;//-----------------------------------------------------------------------
;// ARM core registers
;//-----------------------------------------------------------------------

pDst            RN  0                   ;// r0: where the result block is stored
pSrc            RN  1                   ;// r1: where the input block is loaded from


;//-----------------------------------------------------------------------
;// NEON registers
;//
;// Every D alias is viewed as four signed 16-bit lanes.  Several aliases
;// deliberately share a physical register because their live ranges do
;// not overlap:
;//   D0-D3 : loaded input -> first-pass output -> final output
;//   D4    : constant zero
;//   D5-D8 : butterfly terms; D7/D8 first hold the halved operands
;//-----------------------------------------------------------------------

;// Input as delivered by the de-interleaving load
dSrc0           DN  D0.S16
dSrc1           DN  D1.S16
dSrc2           DN  D2.S16
dSrc3           DN  D3.S16

;// Constant
dZeros          DN  D4.S16

;// First pass: butterfly terms and halved operands
dRowE0          DN  D5.S16
dRowE1          DN  D6.S16
dRowE2          DN  D7.S16
dRowE3          DN  D8.S16
dSrc1Half       DN  D7.S16
dSrc3Half       DN  D8.S16

;// First pass output (also addressed as Q pairs for the transpose)
dRow0           DN  D0.S16
dRow1           DN  D1.S16
dRow2           DN  D2.S16
dRow3           DN  D3.S16
qRow01          QN  Q0.32
qRow23          QN  Q1.32

;// Second pass: butterfly terms and halved operands
dColE0          DN  D5.S16
dColE1          DN  D6.S16
dColE2          DN  D7.S16
dColE3          DN  D8.S16
dRow1Half       DN  D7.S16
dRow3Half       DN  D8.S16

;// Final output
dRes0           DN  D0.S16
dRes1           DN  D1.S16
dRes2           DN  D2.S16
dRes3           DN  D3.S16


;//-----------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Runs the same 4-point butterfly twice over sixteen signed 16-bit
;// values (with a register transpose in between) and then rounds every
;// result as (x + 32) >> 6.
;//
;// One butterfly pass, applied lane-wise to four registers d0..d3:
;//     e0 = d0 + d2            f0 = e0 + e3
;//     e1 = d0 - d2            f1 = e1 + e2
;//     e2 = (d1 >> 1) - d3     f2 = e1 - e2
;//     e3 = d1 + (d3 >> 1)     f3 = e0 - e3
;//
;// In:    r0 = pDst   32 bytes are written here by the final VST1
;//        r1 = pSrc   32 bytes are read from here by the initial VLD4
;// Out:   no return value is set by this routine
;// Uses:  D0-D8.  d8 is passed to M_START; presumably the macro saves and
;//        restores it -- confirm in armCOMM_s.h.
;// NOTE(review): the file name suggests this is the H.264 4x4 residual
;//        transform operating on a row-major block -- confirm with the
;//        C-level caller.
;//-----------------------------------------------------------------------

        M_START armVCM4P10_TransformResidual4x4, ,d8

        ;//---------------------------------------------------------------
        ;// Load.  VLD4 de-interleaves with a stride of four elements, so
        ;// dSrcK receives source elements K, K+4, K+8 and K+12.  The
        ;// block therefore arrives already transposed and no explicit
        ;// transpose is needed before the first pass.
        ;//---------------------------------------------------------------

        VLD4        {dSrc0,dSrc1,dSrc2,dSrc3},[pSrc]

        VMOV        dZeros,#0                           ;// halving-add with 0 gives x >> 1

        ;//---------------------------------------------------------------
        ;// First pass (the "row" pass, evaluated across registers so that
        ;// all four lanes are processed at once)
        ;//---------------------------------------------------------------

        VADD        dRowE0,dSrc0,dSrc2                  ;// e0 = d0 + d2
        VSUB        dRowE1,dSrc0,dSrc2                  ;// e1 = d0 - d2
        VHADD       dSrc1Half,dSrc1,dZeros              ;// (d1 + 0) >> 1
        VHADD       dSrc3Half,dSrc3,dZeros              ;// (d3 + 0) >> 1
        VSUB        dRowE2,dSrc1Half,dSrc3              ;// e2 = (d1 >> 1) - d3, reuses D7
        VADD        dRowE3,dSrc1,dSrc3Half              ;// e3 = d1 + (d3 >> 1), reuses D8
        VADD        dRow0,dRowE0,dRowE3                 ;// f0 = e0 + e3
        VADD        dRow1,dRowE1,dRowE2                 ;// f1 = e1 + e2
        VSUB        dRow2,dRowE1,dRowE2                 ;// f2 = e1 - e2
        VSUB        dRow3,dRowE0,dRowE3                 ;// f3 = e0 - e3

        ;//---------------------------------------------------------------
        ;// Transpose the 4x4 intermediate held in D0-D3: swap 16-bit
        ;// lanes inside each register pair, then swap 32-bit lanes
        ;// between the two Q registers.
        ;//---------------------------------------------------------------

        VTRN        dRow0,dRow1
        VTRN        dRow2,dRow3
        VTRN        qRow01,qRow23

        ;//---------------------------------------------------------------
        ;// Second pass (the "column" pass): same butterfly, now fed with
        ;// the transposed first-pass output
        ;//---------------------------------------------------------------

        VADD        dColE0,dRow0,dRow2                  ;// e0 = d0 + d2
        VSUB        dColE1,dRow0,dRow2                  ;// e1 = d0 - d2
        VHADD       dRow1Half,dRow1,dZeros              ;// (d1 + 0) >> 1
        VHADD       dRow3Half,dRow3,dZeros              ;// (d3 + 0) >> 1
        VSUB        dColE2,dRow1Half,dRow3              ;// e2 = (d1 >> 1) - d3, reuses D7
        VADD        dColE3,dRow1,dRow3Half              ;// e3 = d1 + (d3 >> 1), reuses D8
        VADD        dRes0,dColE0,dColE3                 ;// f0 = e0 + e3
        VADD        dRes1,dColE1,dColE2                 ;// f1 = e1 + e2
        VSUB        dRes2,dColE1,dColE2                 ;// f2 = e1 - e2
        VSUB        dRes3,dColE0,dColE3                 ;// f3 = e0 - e3

        ;//---------------------------------------------------------------
        ;// Final scaling: rounding shift right by 6, i.e. (x + 32) >> 6
        ;//---------------------------------------------------------------

        VRSHR       dRes0,dRes0,#6
        VRSHR       dRes1,dRes1,#6
        VRSHR       dRes2,dRes2,#6
        VRSHR       dRes3,dRes3,#6

        ;//---------------------------------------------------------------
        ;// Store the four result registers back to back at pDst
        ;//---------------------------------------------------------------

        VST1        {dRes0,dRes1,dRes2,dRes3},[pDst]

        ;// Nothing is written to a return register here

End

        ;// Function tail
        M_END

    ENDIF                                               ;//CortexA8

    END