/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Syntax: GNU as (unified operand order "op dst, src"), ARM state, NEON.
 * ENTRY opens an EHABI unwind region (.fnstart) that END closes (.fnend).
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* lanepair: look up and interpolate two pixels.
 *
 * For each of the two pixels, eight bytes are loaded from the table at four
 * addresses: base, base+pitchy, base+pitchy+pitchz and base+pitchz, where
 * base = r3 + the pixel's 32-bit offset taken from src.  Each eight-byte
 * load holds the table entry at the X index (low four bytes) and its
 * neighbour at X+1 (high four bytes).  The values are blended along Y, then
 * Z, then X using 8-bit fractions.
 *
 *   dst       D register receiving the two results (4 bytes each)
 *   src       D register holding two 32-bit byte offsets into the table
 *   xr0, xr1  16-bit scalars: X fraction for the first / second pixel
 *   yr0, yr1  8-bit scalars:  Y fraction for the first / second pixel
 *   zr0, zr1  16-bit scalars: Z fraction for the first / second pixel
 *
 * Reads:    r3 (table base), r4 (pitchy), r5 (pitchz)
 * Clobbers: r6, r7, d6, d7, q8-q15
 */
.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1

            vmov.s32    r6, r7, \src

            /* r6, r7 = table address for each of the two pixels */
            add         r6, r6, r3
            add         r7, r7, r3

            /* (y, z), then step by pitchy */
            vld1.u8     d16, [r6], r4
            vld1.u8     d17, [r7], r4

            /* (y+1, z), then step by pitchz */
            vld1.u8     d18, [r6], r5
            vld1.u8     d19, [r7], r5

            vdup.u8     d6, \yr0
            vdup.u8     d7, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13
             * result = (a << 8) - a * frac + b * frac
             */
            vshll.u8    q12, d16, #8
            vshll.u8    q13, d17, #8
            vmlsl.u8    q12, d16, d6
            vmlsl.u8    q13, d17, d7
            vmlal.u8    q12, d18, d6
            vmlal.u8    q13, d19, d7

            /* (y+1, z+1) */
            vld1.u8     d18, [r6]
            vld1.u8     d19, [r7]

            sub         r6, r6, r4
            sub         r7, r7, r4

            /* (y, z+1) */
            vld1.u8     d16, [r6]
            vld1.u8     d17, [r7]

            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
            vshll.u8    q14, d16, #8
            vshll.u8    q15, d17, #8
            vmlsl.u8    q14, d16, d6
            vmlsl.u8    q15, d17, d7
            vmlal.u8    q14, d18, d6
            vmlal.u8    q15, d19, d7

            /* Z interpolate, lane 0 q12/q14 -> q10 */
            vshll.u16   q8, d24, #8
            vshll.u16   q9, d25, #8
            vmlsl.u16   q8, d24, \zr0
            vmlsl.u16   q9, d25, \zr0
            vmlal.u16   q8, d28, \zr0
            vmlal.u16   q9, d29, \zr0
            vrshrn.u32  d20, q8, #8
            vrshrn.u32  d21, q9, #8

            /* Z interpolate, lane 1 q13/q15 -> q11 */
            vshll.u16   q8, d26, #8
            vshll.u16   q9, d27, #8
            vmlsl.u16   q8, d26, \zr1
            vmlsl.u16   q9, d27, \zr1
            vmlal.u16   q8, d30, \zr1
            vmlal.u16   q9, d31, \zr1
            vrshrn.u32  d22, q8, #8
            vrshrn.u32  d23, q9, #8

            /* X interpolate, lanes 0 and 1 q10,q11 -> q14
             * (low D half = entry at X, high D half = entry at X+1)
             */
            vshll.u16   q8, d20, #8
            vshll.u16   q9, d22, #8
            vmlsl.u16   q8, d20, \xr0
            vmlsl.u16   q9, d22, \xr1
            vmlal.u16   q8, d21, \xr0
            vmlal.u16   q9, d23, \xr1
            vshrn.u32   d28, q8, #8
            vshrn.u32   d29, q9, #8

            /* pack lanes 0-1 -> dst (rounding, saturating narrow to bytes) */
            vqrshrn.u16 \dst, q14, #8
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // r0
 *          void const *in,     // r1
 *          size_t count,       // r2
 *          void const *lut,    // r3
 *          int32_t pitchy,     // [sp]
 *          int32_t pitchz,     // [sp+#4]
 *          int dimx,           // [sp+#8]
 *          int dimy,           // [sp+#12]
 *          int dimz);          // [sp+#16]
 *
 * Maps count 4-byte pixels from in to dst through a 3D table of 4-byte
 * entries.  Bytes 0, 1 and 2 of each input pixel are scaled by dimx, dimy
 * and dimz; the integer parts select the table entry at byte offset
 * x * 4 + y * pitchy + z * pitchz, and the 8-bit fractional parts
 * interpolate towards the neighbouring entries.  Byte 3 of each input pixel
 * is copied to the output unchanged.
 *
 * count is tested as a signed value; nothing is done when it is <= 0.
 * Pixels are handled eight at a time; a final group of 1-7 pixels is padded
 * with copies of an input pixel for the calculation and only the valid
 * results are stored.
 *
 * NOTE(review): the interpolation reads the entries at index + 1 on every
 * axis, so dimx/dimy/dimz look like they are expected to be
 * (table size - 1) -- confirm against the caller.
 *
 * ABI:      AAPCS (ARM state).  r4-r7 and d8-d15 are saved and restored.
 * Clobbers: r0-r3, r12, q0-q3, q8-q15, flags.  r0 is 0 on return.
 * Stack:    16 bytes (r4-r7) + 64 bytes (d8-d15).
 */
ENTRY(rsdIntrinsic3DLUT_K)
            push        {r4,r5,r6,r7}
            .save       {r4,r5,r6,r7}   /* describe the push to the unwinder */
            /* stack arguments sit above the 16 bytes just pushed */
            ldr         r4, [sp, #16]   /* pitchy */
            ldr         r5, [sp, #20]   /* pitchz */
            ldr         r6, [sp, #24]   /* dimx */
            ldr         r7, [sp, #28]   /* dimy */
            ldr         r12, [sp, #32]  /* dimz */
            vpush       {d8-d15}
            .vsave      {d8-d15}        /* describe the vpush to the unwinder */

            /* d8 = { dimx, dimy, dimz, - } as 16-bit lanes
             * d9 = { pitchy, pitchz }      as 32-bit lanes
             */
            vmov.u8     d8, #1
            vmov.u16    d8[0], r6
            vmov.u16    d8[1], r7
            vmov.u16    d8[2], r12
            vmov.s32    d9, r4, r5

            /* From here on r2 = pixels remaining - 8. */
            subs        r2, #8
            bge         .Llut3d_load_full       /* at least 8 pixels */
            cmp         r2, #-8
            ble         .Llut3d_done            /* count <= 0 */
            b           .Llut3d_load_tail       /* 1-7 pixels */

            .align 6
.Llut3d_store_full:
            vst4.u8     {d12,d13,d14,d15}, [r0]!
    /* r0  = dst
     * r1  = src
     * r2  = count
     * r3  = lut
     * r4  = pitchy
     * r5  = pitchz
     * r6  = offset0
     * r7  = offset1
     */
.Llut3d_load_full:
            vld4.u8     {d0,d2,d4,d6}, [r1]!
.Llut3d_process:
            /* keep byte 3 of every pixel; it is passed through untouched */
            vmov        d10, d6
    /* q0,q1,q2,q5 source data
     * q4 dimensions and pitches
     * q3, scratch register for scalar access
     */
            vmov        q3, q4
            vmovl.u8    q0, d0
            vmovl.u8    q1, d2
            vmovl.u8    q2, d4
            vmul.u16    q0, q0, d6[0]
            vmul.u16    q1, q1, d6[1]
            vmul.u16    q2, q2, d6[2]

    /* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
     * where we try to read from the limit of the array and the limit +1 to
     * interpolate, even though the fractional component is zero.  Strictly this is
     * correct, except for the illegal access problem.
     */
            vsra.u16    q0, q0, #8
            vsra.u16    q1, q1, #8
            vsra.u16    q2, q2, #8

            vshr.u16    q12, q0, #8
            vshr.u16    q13, q1, #8
            vshr.u16    q14, q2, #8

            vbic.u16    q0, #0xff00
            vmovn.u16   d2, q1
            vbic.u16    q2, #0xff00

    /* q0,d2,q2 fractional offset
     * q12,q13,q14 integer offset
     */

            /* offset = x * 4 + y * pitchy + z * pitchz, 32 bits per pixel */
            vshll.u16   q6, d24, #2
            vshll.u16   q7, d25, #2
            vmovl.u16   q8, d26
            vmovl.u16   q9, d27
            vmovl.u16   q10, d28
            vmovl.u16   q11, d29
            vmla.s32    q6, q8,  d9[0]
            vmla.s32    q7, q9,  d9[0]
            vmla.s32    q6, q10, d9[1]
            vmla.s32    q7, q11, d9[1]

    /* q6,q7 list of table offsets */

            /* Each lanepair consumes its own offsets register before
             * overwriting it with results, so src and dst may be the same.
             */

        /* lanes 0 and 1 */
            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]

        /* lanes 2 and 3 */
            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]

        /* lanes 4 and 5 */
            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]

        /* lanes 6 and 7 */
            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]

            /* de-interleave: d12..d15 = byte 0..3 of all eight pixels */
            vuzp.u8     d12, d13
            vuzp.u8     d14, d15
            vuzp.u8     d12, d14
            vuzp.u8     d13, d15

            subs        r2, r2, #8
            /* replace byte 3 with the value saved from the input
             * (NEON moves leave the flags from subs intact)
             */
            vmov.u8     d15, d10

            bge         .Llut3d_store_full      /* another full group follows */

            /* r2 <  -8: this was the padded tail group -> partial store
             * r2 == -8: this was the last full group, nothing remains
             * r2 >  -8: full group done, 1-7 pixels remain
             */
            cmp         r2, #-8
            blt         .Llut3d_store_tail

            vst4.u8     {d12,d13,d14,d15}, [r0]!

            beq         .Llut3d_done            /* flags still from cmp */

            /* Tail load.  The low three bits of r2 equal the number of
             * pixels remaining (only multiples of 8 have been subtracted).
             */
            /* fill the vector with a safe value */
.Llut3d_load_tail:
            vld1.u32    {d0[]}, [r1]
            vmov        d2, d0
            vmov        d4, d0
            vmov        d6, d0
            tst         r2, #4
            beq         .Llut3d_load_tail_2
            vld1.u32    {d0}, [r1]!
            vld1.u32    {d2}, [r1]!
.Llut3d_load_tail_2:
            tst         r2, #2
            beq         .Llut3d_load_tail_1
            vld1.u32    {d4}, [r1]!
.Llut3d_load_tail_1:
            tst         r2, #1
            beq         .Llut3d_load_tail_split
            vld1.u32    {d6[0]}, [r1]!
.Llut3d_load_tail_split:
            /* de-interleave into the same layout vld4 produces */
            vuzp.8      d0, d2
            vuzp.8      d4, d6
            vuzp.8      d0, d4
            vuzp.8      d2, d6
            b           .Llut3d_process

            /* Tail store: re-interleave, then write back only the pixels
             * that the tail load actually read (same bit tests on r2).
             */
.Llut3d_store_tail:
            vzip.8      d12, d14
            vzip.8      d13, d15
            vzip.8      d12, d13
            vzip.8      d14, d15
            tst         r2, #4
            beq         .Llut3d_store_tail_2
            vst1.u32    {d12,d13}, [r0]!
.Llut3d_store_tail_2:
            tst         r2, #2
            beq         .Llut3d_store_tail_1
            vst1.u32    {d14}, [r0]!
.Llut3d_store_tail_1:
            tst         r2, #1
            beq         .Llut3d_done
            vst1.u32    {d15[0]}, [r0]!

.Llut3d_done:
            mov         r0, #0
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7}
            bx          lr
END(rsdIntrinsic3DLUT_K)