/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;


.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

        smov    x6, \src0
        smov    x7, \src1

        add     x6, x6, x3
        add     x7, x7, x3

        ld1     {v16.2s}, [x6], x4
        ld1     {v17.2s}, [x7], x4

        ld1     {v18.2s}, [x6], x5
        ld1     {v19.2s}, [x7], x5

        dup     v8.8b, \yr0
        dup     v9.8b, \yr1
        /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
        zip1    v12.16b, v5.16b, v16.16b
        zip1    v13.16b, v5.16b, v17.16b
        umlsl   v12.8h, v16.8b, v8.8b
        umlsl   v13.8h, v17.8b, v9.8b
        umlal   v12.8h, v18.8b, v8.8b
        umlal   v13.8h, v19.8b, v9.8b

        ld1     {v18.2s}, [x6]
        ld1     {v19.2s}, [x7]

        sub     x6, x6, x4
        sub     x7, x7, x4

        ld1     {v16.2s}, [x6]
        ld1     {v17.2s}, [x7]

        /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
        zip1    v14.16b, v5.16b, v16.16b
        zip1    v15.16b, v5.16b, v17.16b
        umlsl   v14.8h, v16.8b, v8.8b
        umlsl   v15.8h, v17.8b, v9.8b
        umlal   v14.8h, v18.8b, v8.8b
        umlal   v15.8h, v19.8b, v9.8b

        /* Z interpolate, lane 0 v12/v14 -> v10 */
        ushll   v8.4s, v12.4h, #8
        ushll2  v9.4s, v12.8h, #8
        umlsl   v8.4s, v12.4h, \zr0
        umlsl2  v9.4s, v12.8h, \zr0
        umlal   v8.4s, v14.4h, \zr0
        umlal2  v9.4s, v14.8h, \zr0
        rshrn   v10.4h, v8.4s, #8
        rshrn2  v10.8h, v9.4s, #8

        /* Z interpolate, lane 1 v13/v15 -> v11 */
        ushll   v8.4s, v13.4h, #8
        ushll2  v9.4s, v13.8h, #8
        umlsl   v8.4s, v13.4h, \zr1
        umlsl2  v9.4s, v13.8h, \zr1
        umlal   v8.4s, v15.4h, \zr1
        umlal2  v9.4s, v15.8h, \zr1
        rshrn   v11.4h, v8.4s, #8
        rshrn2  v11.8h, v9.4s, #8

        /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
        ushll   v8.4s, v10.4h, #8
        ushll   v9.4s, v11.4h, #8
        umlsl   v8.4s, v10.4h, \xr0
        umlsl   v9.4s, v11.4h, \xr1
        umlal2  v8.4s, v10.8h, \xr0
        umlal2  v9.4s, v11.8h, \xr1
        shrn    v14.4h, v8.4s, #8
        shrn2   v14.8h, v9.4s, #8

        /* pack lanes 0-1 -> v6 */
.ifc \dst, v20.16b
        uqrshrn2 \dst, v14.8h, #8
.else ; .ifc \dst, v21.16b
        uqrshrn2 \dst, v14.8h, #8
.else
        uqrshrn \dst, v14.8h, #8
.endif ; .endif
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // x0
 *          void const *in,     // x1
 *          size_t count,       // x2
 *          void const *lut,    // x3
 *          int32_t pitchy,     // w4
 *          int32_t pitchz,     // w5
 *          int dimx,           // w6
 *          int dimy,           // w7
 *          int dimz);          // [sp]
 */
ENTRY(rsdIntrinsic3DLUT_K)
        ldr     w8, [sp]
        stp     d8, d9, [sp, #-64]!
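        /* v8-v15 must have their low 64 bits preserved under AAPCS64, and the
         * loop below uses d8-d15 as scratch, so they are spilled to a 64-byte
         * frame here and restored at label 9 before returning. */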
        stp     d10, d11, [sp, #16]
        stp     d12, d13, [sp, #32]
        stp     d14, d15, [sp, #48]
        movi    v4.8b, #1
        ins     v4.h[0], w6
        ins     v4.h[1], w7
        ins     v4.h[2], w8
        ins     v4.s[2], w4
        ins     v4.s[3], w5
        movi    v5.16b, #0

        subs    x2, x2, #8
        bge     2f
        cmp     x2, #-8
        ble     9f
        b       4f

        .align 6
1:      st4     {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
/* x0 = dst
 * x1 = src
 * x2 = count
 * x3 = lut
 * x4 = pitchy
 * x5 = pitchz
 * x6 = offset0
 * x7 = offset1
 */
2:      ld4     {v0.8b-v3.8b}, [x1], #32
/* v0,v1,v2,v3 source data
 * v4 dimensions and pitches
 */
3:      uxtl    v0.8h, v0.8b
        uxtl    v1.8h, v1.8b
        uxtl    v2.8h, v2.8b
        mul     v0.8h, v0.8h, v4.h[0]
        mul     v1.8h, v1.8h, v4.h[1]
        mul     v2.8h, v2.8h, v4.h[2]

/* ursra below would be more accurate, but this can result in a dim.0 case
 * where we try to read from the limit of the array and the limit + 1 to
 * interpolate, even though the fractional component is zero.  Strictly this is
 * correct, except for the illegal access problem.
 */
        usra    v0.8h, v0.8h, #8
        usra    v1.8h, v1.8h, #8
        usra    v2.8h, v2.8h, #8

        ushr    v12.8h, v0.8h, #8
        ushr    v13.8h, v1.8h, #8
        ushr    v14.8h, v2.8h, #8
        bic     v0.8h, #0xff, LSL #8
        xtn     v1.8b, v1.8h
        bic     v2.8h, #0xff, LSL #8

/* v0.8h,v1.8b,v2.8h fractional offset
 * v12.8h,v13.8h,v14.8h integer offset
 */

        ushll   v6.4s, v12.4h, #2
        ushll2  v7.4s, v12.8h, #2
        uxtl    v8.4s, v13.4h
        uxtl2   v9.4s, v13.8h
        uxtl    v10.4s, v14.4h
        uxtl2   v11.4s, v14.8h
        mla     v6.4s, v8.4s, v4.s[2]
        mla     v7.4s, v9.4s, v4.s[2]
        mla     v6.4s, v10.4s, v4.s[3]
        mla     v7.4s, v11.4s, v4.s[3]

/* v6,v7 list of table offsets */

        /* lanes 0 and 1 */
        lanepair dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

        /* lanes 2 and 3 */
        lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

        /* lanes 4 and 5 */
        lanepair dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

        /* lanes 6 and 7 */
        lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

        uzp1    v6.16b, v20.16b, v21.16b
        uzp2    v7.16b, v20.16b, v21.16b
        uzp1    v20.16b, v6.16b, v7.16b
        uzp2    v22.16b, v6.16b, v7.16b
        mov     v21.d[0], v20.d[1]

        subs    x2, x2, #8
        mov     v23.8b, v3.8b

        bge     1b

        cmp     x2, #-8
        blt     1f

        st4     {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
        beq     9f

        /* fill the vector with a safe value */
4:      ld4r    {v0.8b-v3.8b}, [x1]
        tbz     x2, #2, 2f
        ld4     {v0.b-v3.b}[0], [x1], #4
        ld4     {v0.b-v3.b}[1], [x1], #4
        ld4     {v0.b-v3.b}[2], [x1], #4
        ld4     {v0.b-v3.b}[3], [x1], #4
2:      tbz     x2, #1, 2f
        ld4     {v0.b-v3.b}[4], [x1], #4
        ld4     {v0.b-v3.b}[5], [x1], #4
2:      tbz     x2, #0, 2f
        ld4     {v0.b-v3.b}[6], [x1], #4
2:      b       3b

1:      tst     x2, #4
        beq     2f
        st4     {v20.b-v23.b}[0], [x0], #4
        st4     {v20.b-v23.b}[1], [x0], #4
        st4     {v20.b-v23.b}[2], [x0], #4
        st4     {v20.b-v23.b}[3], [x0], #4
2:      tst     x2, #2
        beq     2f
        st4     {v20.b-v23.b}[4], [x0], #4
        st4     {v20.b-v23.b}[5], [x0], #4
2:      tst     x2, #1
        beq     9f
        st4     {v20.b-v23.b}[6], [x0], #4

9:      ldp     d14, d15, [sp, #48]
        ldp     d12, d13, [sp, #32]
        ldp     d10, d11, [sp, #16]
        ldp     d8, d9, [sp], #64
        ret
END(rsdIntrinsic3DLUT_K)
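
/* The block below is a minimal C sketch of the trilinear interpolation that
 * the kernel above implements: scale r, g and b into LUT coordinates, split
 * each into an integer cell plus a fraction, blend the eight surrounding
 * texels, and copy alpha through unchanged.  It is a reading aid only and is
 * fenced off with #if 0 so the assembler never sees it.  The helper names
 * (lut3d_ref, lut_at) are invented for illustration, the float arithmetic only
 * approximates the 8.8 fixed-point rounding used by the assembly, and it
 * assumes 4-byte RGBA texels, dimensions of at least 2, and the usual
 * (dim - 1) scaling convention, none of which this file spells out.
 */
#if 0
#include <stddef.h>
#include <stdint.h>

// Address of one 4-byte RGBA texel inside the LUT.
static const uint8_t *lut_at(const uint8_t *lut, int x, int y, int z,
                             int32_t pitchy, int32_t pitchz)
{
    return lut + (size_t)x * 4 + (size_t)y * (size_t)pitchy
               + (size_t)z * (size_t)pitchz;
}

static void lut3d_ref(uint8_t *dst, const uint8_t *src, size_t count,
                      const uint8_t *lut, int32_t pitchy, int32_t pitchz,
                      int dimx, int dimy, int dimz)
{
    for (size_t i = 0; i < count; i++, src += 4, dst += 4) {
        // Scale the 0..255 channel values into LUT coordinates.
        float fx = src[0] * (dimx - 1) / 255.0f;
        float fy = src[1] * (dimy - 1) / 255.0f;
        float fz = src[2] * (dimz - 1) / 255.0f;
        int x = (int)fx, y = (int)fy, z = (int)fz;
        // Keep the cell inside the table so x+1, y+1 and z+1 are valid reads;
        // the usra adjustment in the assembly guards against the same overrun.
        if (x > dimx - 2) x = dimx - 2;
        if (y > dimy - 2) y = dimy - 2;
        if (z > dimz - 2) z = dimz - 2;
        fx -= x; fy -= y; fz -= z;
        for (int c = 0; c < 3; c++) {               // r, g, b
            float v = 0.0f;
            for (int dz = 0; dz < 2; dz++)
                for (int dy = 0; dy < 2; dy++)
                    for (int dx = 0; dx < 2; dx++) {
                        float w = (dx ? fx : 1.0f - fx)
                                * (dy ? fy : 1.0f - fy)
                                * (dz ? fz : 1.0f - fz);
                        v += w * lut_at(lut, x + dx, y + dy, z + dz,
                                        pitchy, pitchz)[c];
                    }
            dst[c] = (uint8_t)(v + 0.5f);
        }
        dst[3] = src[3];                            // alpha passes through
    }
}
#endif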