/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vp8_rtcd.h"

static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
                                   int stride) {
  unsigned char *dst0;
  int i, a0, a1;
  int16x8x2_t q2Add;
  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
  uint8x8_t d2u8, d4u8;
  uint16x8_t q1u16, q2u16;

  // Only the DC coefficient of each block survives: dequantize it, apply the
  // transform's rounded >> 3, and clear the coefficients for the next block.
  a0 = ((q[0] * dq) + 4) >> 3;
  a1 = ((q[16] * dq) + 4) >> 3;
  q[0] = q[16] = 0;
  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
  q2Add.val[1] = vdupq_n_s16((int16_t)a1);

  // The two 4x4 blocks sit side by side in dst; iteration i adds the
  // constant a_i to block i and saturates back to 8 bits.
  for (i = 0; i < 2; i++, dst += 4) {
    dst0 = dst;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d2s32));
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d4s32));

    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));

    d2s32 = vreinterpret_s32_u8(d2u8);
    d4s32 = vreinterpret_s32_u8(d4u8);

    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
  }
}
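
/* For reference, a hedged scalar sketch (not part of libvpx) of what the
 * DC-only path above computes for a single 4x4 block: with eob <= 1, only
 * the quantized DC q[0] is present, so the whole inverse transform collapses
 * to adding one rounded constant to every pixel:
 *
 *   int a = ((q[0] * dq) + 4) >> 3;  // dequantized DC with the final >> 3
 *   int r, c;
 *   q[0] = 0;
 *   for (r = 0; r < 4; ++r, dst += stride)
 *     for (c = 0; c < 4; ++c) {
 *       int v = dst[c] + a;
 *       dst[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
 *     }
 */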
static const int16_t cospi8sqrt2minus1 = 20091;
// The scalar transform uses sinpi8sqrt2 = 35468 (0x8a8c) with a >> 16 scale.
// Because the lowest bit of 0x8a8c is 0, the constant can be pre-shifted to
// 17734; vqdmulh doubles its product, restoring the original scale exactly.
static const int16_t sinpi8sqrt2 = 17734;

static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
                                      unsigned char *dst, int stride) {
  unsigned char *dst0, *dst1;
  int32x2_t d28, d29, d30, d31;
  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
  int16x8_t qEmpty = vdupq_n_s16(0);
  int32x4x2_t q2tmp0, q2tmp1;
  int16x8x2_t q2tmp2, q2tmp3;
  int16x4_t dLow0, dLow1, dHigh0, dHigh1;

  d28 = d29 = d30 = d31 = vdup_n_s32(0);

  // load dq
  q0 = vld1q_s16(dq);
  dq += 8;
  q1 = vld1q_s16(dq);

  // load q, clearing the coefficients as they are consumed
  q2 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q3 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q4 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q5 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);

  // load src from dst
  dst0 = dst;
  dst1 = dst + 4;
  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
  dst0 += stride;
  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
  dst1 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
  dst0 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
  dst1 += stride;

  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
  dst0 += stride;
  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
  dst1 += stride;
  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

  // dequantize
  q2 = vmulq_s16(q2, q0);
  q3 = vmulq_s16(q3, q1);
  q4 = vmulq_s16(q4, q0);
  q5 = vmulq_s16(q5, q1);

  // vswp: after this, each register holds one row of both 4x4 blocks:
  // q2 = row 0, q4 = row 1, q3 = row 2, q5 = row 3
  dLow0 = vget_low_s16(q2);
  dHigh0 = vget_high_s16(q2);
  dLow1 = vget_low_s16(q4);
  dHigh1 = vget_high_s16(q4);
  q2 = vcombine_s16(dLow0, dLow1);
  q4 = vcombine_s16(dHigh0, dHigh1);

  dLow0 = vget_low_s16(q3);
  dHigh0 = vget_high_s16(q3);
  dLow1 = vget_low_s16(q5);
  dHigh1 = vget_high_s16(q5);
  q3 = vcombine_s16(dLow0, dLow1);
  q5 = vcombine_s16(dHigh0, dHigh1);

  // first pass (rows)
  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

  q10 = vqaddq_s16(q2, q3);
  q11 = vqsubq_s16(q2, q3);

  q8 = vshrq_n_s16(q8, 1);
  q9 = vshrq_n_s16(q9, 1);

  q4 = vqaddq_s16(q4, q8);
  q5 = vqaddq_s16(q5, q9);

  q2 = vqsubq_s16(q6, q5);
  q3 = vqaddq_s16(q7, q4);

  q4 = vqaddq_s16(q10, q3);
  q5 = vqaddq_s16(q11, q2);
  q6 = vqsubq_s16(q11, q2);
  q7 = vqsubq_s16(q10, q3);

  // transpose
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // second pass (columns), on the transposed data
  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

  q10 = vshrq_n_s16(q10, 1);
  q11 = vshrq_n_s16(q11, 1);

  q10 = vqaddq_s16(q2tmp2.val[1], q10);
  q11 = vqaddq_s16(q2tmp3.val[1], q11);

  q8 = vqsubq_s16(q8, q11);
  q9 = vqaddq_s16(q9, q10);

  q4 = vqaddq_s16(q2, q9);
  q5 = vqaddq_s16(q3, q8);
  q6 = vqsubq_s16(q3, q8);
  q7 = vqsubq_s16(q2, q9);

  // rounded >> 3 finishes the transform scaling
  q4 = vrshrq_n_s16(q4, 3);
  q5 = vrshrq_n_s16(q5, 3);
  q6 = vrshrq_n_s16(q6, 3);
  q7 = vrshrq_n_s16(q7, 3);

  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // add the predictor and saturate back to 8 bits
  q4 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
  q5 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
  q6 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
  q7 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));

  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

  dst0 = dst;
  dst1 = dst + 4;
  vst1_lane_s32((int32_t *)dst0, d28, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d28, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d29, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d29, 1);
  dst1 += stride;

  vst1_lane_s32((int32_t *)dst0, d30, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d30, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d31, 0);
  vst1_lane_s32((int32_t *)dst1, d31, 1);
}
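
/* A hedged scalar sketch of one 1-D pass of the transform above; compare
 * vp8_short_idct4x4llm_c in vp8/common/idctllm.c, which uses the unhalved
 * constant 35468 with an explicit >> 16:
 *
 *   a1 = x0 + x2;                                   // q10
 *   b1 = x0 - x2;                                   // q11
 *   c1 = ((x1 * 35468) >> 16)
 *        - (x3 + ((x3 * 20091) >> 16));             // q2 = q6 - (q5 + q9)
 *   d1 = (x1 + ((x1 * 20091) >> 16))
 *        + ((x3 * 35468) >> 16);                    // q3 = q7 + (q4 + q8)
 *   out0 = a1 + d1;  out1 = b1 + c1;                // q4, q5
 *   out2 = b1 - c1;  out3 = a1 - d1;                // q6, q7
 *
 * vqdmulh doubles its product, so the pre-halved 17734 reproduces
 * (x * 35468) >> 16 directly, while the 20091 products need the extra >> 1
 * (q8/q9 and q10/q11 above) to land on (x * 20091) >> 16.
 */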
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                       int stride, char *eobs) {
  int i;

  // eobs holds one end-of-block count per 4x4 block; reading a short tests a
  // horizontal pair of blocks at once. 0xfefe masks off the low bit of each
  // byte, so it is nonzero exactly when some block in the pair has a
  // coefficient beyond the DC (eob > 1), which requires the full transform.
  for (i = 0; i < 4; ++i) {
    if (((short *)eobs)[0]) {
      if (((short *)eobs)[0] & 0xfefe)
        idct_dequant_full_2x_neon(q, dq, dst, stride);
      else
        idct_dequant_0_2x_neon(q, dq[0], dst, stride);
    }

    if (((short *)eobs)[1]) {
      if (((short *)eobs)[1] & 0xfefe)
        idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride);
      else
        idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride);
    }
    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}

void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                        unsigned char *dst_u,
                                        unsigned char *dst_v, int stride,
                                        char *eobs) {
  if (((short *)eobs)[0]) {
    if (((short *)eobs)[0] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;
  dst_u += 4 * stride;

  if (((short *)eobs)[1]) {
    if (((short *)eobs)[1] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;

  if (((short *)eobs)[2]) {
    if (((short *)eobs)[2] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }

  q += 32;
  dst_v += 4 * stride;

  if (((short *)eobs)[3]) {
    if (((short *)eobs)[3] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }
}
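
/* Usage sketch (hedged; the surrounding decoder-state names here are
 * illustrative): in libvpx these entry points are reached through the RTCD
 * table as vp8_dequant_idct_add_y_block / vp8_dequant_idct_add_uv_block.
 *
 *   short q[16 * 16];    // quantized coefficients, 16 4x4 luma blocks
 *   short dq[16];        // dequant factors: dq[0] for DC, dq[1..15] for AC
 *   unsigned char *y;    // 16x16 reconstruction target in the frame buffer
 *   char eobs[16];       // end-of-block count per 4x4 block
 *
 *   vp8_dequant_idct_add_y_block_neon(q, dq, y, y_stride, eobs);
 */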