;
;  Copyright (c) 2010 The Webm project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |idct_dequant_full_2x_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
;                                unsigned char *dst, int pitch, int stride);
;
; Dequantizes two consecutive 16-coefficient blocks (referred to below as
; "l"/left and "r"/right), runs both passes of the 4x4 inverse transform
; on them in parallel, adds the predictor, and stores the saturated
; 8-bit result.  The 64 bytes of coefficients at q are zeroed afterwards.
;
; In:    r0      = q    (32 shorts: l block, then r block)
;        r1      = dq   (16 shorts, shared by both blocks)
;        r2      = pre  (l predictor; r predictor is read from pre + 4)
;        r3      = dst  (l output;    r output is written to dst + 4)
;        [sp]    = pitch  (byte step between predictor rows)
;        [sp+4]  = stride (byte step between destination rows)
; Clobb: r0-r3, r12, q0-q3, q8-q15
;        q4-q7 (d8-d15) are used as scratch but are callee-saved under
;        the AAPCS, so they are pushed on entry and restored on exit.
; Stack: 64 bytes (the saved d8-d15); sp stays 8-byte aligned.
;-----------------------------------------------------------------------
|idct_dequant_full_2x_neon| PROC
    vpush           {d8-d15}                ; preserve callee-saved q4-q7

    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
    vld1.16         {q2, q3}, [r0]          ; l q
    ldr             r1, [sp, #64]           ; pitch (arg 5, above the 64
                                            ; bytes pushed by vpush)
    add             r0, r0, #32
    vld1.16         {q4, q5}, [r0]          ; r q
    add             r12, r2, #4             ; r12 = r pre

    ; interleave the predictors: each d register holds one row,
    ; l block in lane 0 and r block in lane 1
    vld1.32         {d28[0]}, [r2],  r1     ; l pre
    vld1.32         {d28[1]}, [r12], r1     ; r pre
    vld1.32         {d29[0]}, [r2],  r1
    vld1.32         {d29[1]}, [r12], r1
    vld1.32         {d30[0]}, [r2],  r1
    vld1.32         {d30[1]}, [r12], r1
    vld1.32         {d31[0]}, [r2]
    vld1.32         {d31[1]}, [r12]

    ; PC-relative address of the constant pool; avoids storing an
    ; absolute address (and thus a relocation) in the code area
    adr             r2, cospi8sqrt2minus1

    ; dequant: q[i] = q[i] * dq[i]
    vmul.i16        q2, q2, q0
    vmul.i16        q3, q3, q1
    vmul.i16        q4, q4, q0
    vmul.i16        q5, q5, q1

    ; the constants are stored as 32-bit words, so after this load
    ; d0[0] = cospi8sqrt2minus1 and d0[2] = sinpi8sqrt2
    vld1.16         {d0}, [r2]

    ; q2: l0r0  q3: l8r8
    ; q4: l4r4  q5: l12r12
    vswp            d5, d8
    vswp            d7, d10

    ; _CONSTANTS_ * 4,12 >> 16
    ; q6:  4 * sinpi : c1/temp1
    ; q7: 12 * sinpi : d1/temp2
    ; q8:  4 * cospi
    ; q9: 12 * cospi
    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
    vqdmulh.s16     q7, q5, d0[2]
    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
    vqdmulh.s16     q9, q5, d0[0]

    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8

    ; vqdmulh only accepts signed values. this was a problem because
    ; our constant had the high bit set, and was treated as a negative
    ; value. vqdmulh also doubles the value before it shifts by 16. we
    ; need to compensate for this. in the case of sinpi8sqrt2, the
    ; lowest bit is 0, so we can shift the constant without losing
    ; precision. this avoids shifting again afterward, and also avoids
    ; the sign issue. for cospi8sqrt2minus1 the lowest bit is 1, so we
    ; would lose precision if we pre-shifted it; shift the product
    ; instead.
    vshr.s16        q8, q8, #1
    vshr.s16        q9, q9, #1

    ; q4:  4 +  4 * cospi : d1/temp1
    ; q5: 12 + 12 * cospi : c1/temp2
    vqadd.s16       q4, q4, q8
    vqadd.s16       q5, q5, q9

    ; c1 = temp1 - temp2
    ; d1 = temp1 + temp2
    vqsub.s16       q2, q6, q5
    vqadd.s16       q3, q4, q7

    ; [0]: a1+d1
    ; [1]: b1+c1
    ; [2]: b1-c1
    ; [3]: a1-d1
    vqadd.s16       q4, q10, q3
    vqadd.s16       q5, q11, q2
    vqsub.s16       q6, q11, q2
    vqsub.s16       q7, q10, q3

    ; rotate
    vtrn.32         q4, q6
    vtrn.32         q5, q7
    vtrn.16         q4, q5
    vtrn.16         q6, q7

    ; idct loop 2
    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
    ; q6: l 2, 6,10,14 r 2, 6,10,14
    ; q7: l 3, 7,11,15 r 3, 7,11,15

    ; q8:  1 * sinpi : c1/temp1
    ; q9:  3 * sinpi : d1/temp2
    ; q10: 1 * cospi
    ; q11: 3 * cospi
    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
    vqdmulh.s16     q9, q7, d0[2]
    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
    vqdmulh.s16     q11, q7, d0[0]

    vqadd.s16       q2, q4, q6              ; a1 = 0 + 2
    vqsub.s16       q3, q4, q6              ; b1 = 0 - 2

    ; undo the doubling done by vqdmulh for the cospi products
    ; (same reasoning as in the first pass)
    vshr.s16        q10, q10, #1
    vshr.s16        q11, q11, #1

    ; q10: 1 + 1 * cospi : d1/temp1
    ; q11: 3 + 3 * cospi : c1/temp2
    vqadd.s16       q10, q5, q10
    vqadd.s16       q11, q7, q11

    ; q8: c1 = temp1 - temp2
    ; q9: d1 = temp1 + temp2
    vqsub.s16       q8, q8, q11
    vqadd.s16       q9, q10, q9

    ; a1+d1
    ; b1+c1
    ; b1-c1
    ; a1-d1
    vqadd.s16       q4, q2, q9
    vqadd.s16       q5, q3, q8
    vqsub.s16       q6, q3, q8
    vqsub.s16       q7, q2, q9

    ; +4 >> 3 (rounding)
    vrshr.s16       q4, q4, #3              ; lo
    vrshr.s16       q5, q5, #3
    vrshr.s16       q6, q6, #3              ; hi
    vrshr.s16       q7, q7, #3

    vtrn.32         q4, q6
    vtrn.32         q5, q7
    vtrn.16         q4, q5
    vtrn.16         q6, q7

    ; adding pre
    ; input is still packed. pre was read interleaved
    vaddw.u8        q4, q4, d28
    vaddw.u8        q5, q5, d29
    vaddw.u8        q6, q6, d30
    vaddw.u8        q7, q7, d31

    ; clear all 32 input coefficients (r0 still points at the r block)
    vmov.i16        q14, #0
    vmov            q15, q14
    vst1.16         {q14, q15}, [r0]        ; write over high input
    sub             r0, r0, #32
    vst1.16         {q14, q15}, [r0]        ; write over low input

    ; saturate and narrow
    vqmovun.s16     d0, q4                  ; lo
    vqmovun.s16     d1, q5
    vqmovun.s16     d2, q6                  ; hi
    vqmovun.s16     d3, q7

    ldr             r1, [sp, #68]           ; stride (arg 6, above the 64
                                            ; bytes pushed by vpush)
    add             r2, r3, #4              ; hi
    vst1.32         {d0[0]}, [r3], r1       ; lo
    vst1.32         {d0[1]}, [r2], r1       ; hi
    vst1.32         {d1[0]}, [r3], r1
    vst1.32         {d1[1]}, [r2], r1
    vst1.32         {d2[0]}, [r3], r1
    vst1.32         {d2[1]}, [r2], r1
    vst1.32         {d3[0]}, [r3]
    vst1.32         {d3[1]}, [r2]

    vpop            {d8-d15}                ; restore callee-saved q4-q7
    bx              lr

    ENDP           ; |idct_dequant_full_2x_neon|

; Constant Pool
; Must stay contiguous and in this order: the function loads both words
; with a single 8-byte vld1 starting at cospi8sqrt2minus1.
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2       DCD 0x4546

    END