/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b         // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b         // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b        // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b         // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b         // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h         // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h         // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b       // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h        // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h        // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h        // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h        // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h        // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h        // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h        // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h        // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h         // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h         // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h        // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h        // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b, v0.8h, #6            // r0 = satu8((r0 + 32) >> 6)
        uqrshrn     v4.8b, v4.8h, #6            // r1 = satu8((r1 + 32) >> 6)
        uqrshrn     v1.8b, v1.8h, #7            // g0 = satu8((g0 + 64) >> 7)
        uqrshrn     v5.8b, v5.8h, #7            // g1 = satu8((g1 + 64) >> 7)
        uqrshrn     v2.8b, v2.8h, #6            // b0 = satu8((b0 + 32) >> 6)
        uqrshrn     v6.8b, v6.8h, #6            // b1 = satu8((b1 + 32) >> 6)

        zip1        v0.16b, v0.16b, v4.16b      // re-interleave even/odd pixels: r
        zip1        v1.16b, v1.16b, v5.16b      // re-interleave even/odd pixels: g
        zip1        v2.16b, v2.16b, v6.16b      // re-interleave even/odd pixels: b
.endm
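/* For reference, a scalar sketch of the conversion yuvkern implements
 * (hypothetical C, not part of this file; satu8() and yuv2rgba_px() are
 * illustrative names).  All coefficients are BT.601-style constants scaled
 * by 128, so the total rounding shift is 7; for the R and B channels the
 * vector code above splits that into a halving add (uhadd) plus a rounding
 * narrow by 6 (uqrshrn) so no 16-bit intermediate overflows, at the cost of
 * truncating one low bit, so this sketch can differ from it by one LSB:
 *
 *   static inline uint8_t satu8(int x)
 *   {
 *       return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x;
 *   }
 *
 *   static void yuv2rgba_px(uint8_t y, uint8_t u, uint8_t v, uint8_t *out)
 *   {
 *       int r = 149 * y + (v >> 1) + 204 * v
 *               - (16 * 149 + (128 >> 1) + 128 * 204);
 *       int g = 149 * y - (50 * u + 104 * v)
 *               + (-16 * 149 + 128 * 50 + 128 * 104);
 *       int b = 149 * y + (u << 2) + 254 * u
 *               - (16 * 149 + (128 << 2) + 128 * 254);
 *       out[0] = satu8((r + 64) >> 7);  // ~1.164*(y-16) + 1.598*(v-128)
 *       out[1] = satu8((g + 64) >> 7);  // ~1.164*(y-16) - 0.391*(u-128) - 0.813*(v-128)
 *       out[2] = satu8((b + 64) >> 7);  // ~1.164*(y-16) + 2.016*(u-128)
 *       out[3] = 0xff;                  // constant alpha
 *   }
 */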
/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
.macro wrap_line kernel, interleaved=0, swapuv=0

        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm        PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm        PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm        PLDL1STRM, [x3, #128]
//      prfm        PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions, and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.h}[0], [x3], #2
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
1:      uzp1        v8.16b, v8.16b, v9.16b
  .if \interleaved
    .if \swapuv
        uzp1        v16.16b, v17.16b, v16.16b
    .else
        uzp1        v16.16b, v16.16b, v17.16b
    .endif
  .endif

        \kernel

        /* As above, but for the output: structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm
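/* A rough C outline of the control flow wrap_line expands to (hypothetical,
 * for illustration; load_*/store_*/kernel16 stand in for the vector
 * operations above and n for the pixel count in x2):
 *
 *   while (n >= 16) {                  // main loop: whole 16-pixel groups
 *       load_16px(); kernel16(); store_16px();
 *       n -= 16;
 *   }
 *   if (n) {                           // tail: decompose n into its bits,
 *       if (n & 8) load_8px();         // moving power-of-two chunks into
 *       if (n & 4) load_4px();         // fixed lanes of the zeroed working
 *       if (n & 2) load_2px();         // registers
 *       if (n & 1) load_1px();
 *       kernel16();                    // unused lanes just compute on zeros
 *       if (n & 8) store_8px();        // the stores read back exactly the
 *       if (n & 4) store_4px();        // lanes the loads wrote, so lane
 *       if (n & 2) store_2px();        // positions stay consistent
 *       if (n & 1) store_1px();
 *   }
 */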
/* void rsdIntrinsicYuv2_K(
 *         void *out,        // x0
 *         void const *yin,  // x1
 *         void const *uin,  // x2
 *         void const *vin,  // x3
 *         size_t xstart,    // x4
 *         size_t xend);     // x5
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1              // x6 = xstart / 2 (chroma is subsampled)
        add         x0, x0, x4, LSL #2      // advance out by xstart RGBA pixels
        add         x1, x1, x4              // advance yin by xstart
        add         x4, x3, x6              // x4 = vin + xstart / 2
        add         x3, x2, x6              // x3 = uin + xstart / 2
        sub         x2, x5, x6, LSL #1      // x2 = pixel count (xend - xstart)

        sub         x6, sp, #32
        sub         sp, sp, #64             // save the callee-saved low halves of v8-v15
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line   yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *         void *out,         // x0
 *         void const *yin,   // x1
 *         void const *uvin,  // x2
 *         size_t xstart,     // x3
 *         size_t xend);      // x4
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1              // round xstart down to even
        add         x0, x0, x5, LSL #2      // advance out by xstart RGBA pixels
        add         x1, x1, x5              // advance yin by xstart
        add         x3, x2, x5              // x3 = uvin + xstart (one interleaved byte per pixel)
        sub         x2, x4, x5              // x2 = pixel count (xend - xstart)

        sub         x5, sp, #32
        sub         sp, sp, #64             // save the callee-saved low halves of v8-v15
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line   yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *         void *out,         // x0
 *         void const *yin,   // x1
 *         void const *uvin,  // x2
 *         size_t xstart,     // x3
 *         size_t xend);      // x4
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1              // round xstart down to even
        add         x0, x0, x5, LSL #2      // advance out by xstart RGBA pixels
        add         x1, x1, x5              // advance yin by xstart
        add         x3, x2, x5              // x3 = uvin + xstart (one interleaved byte per pixel)
        sub         x2, x4, x5              // x2 = pixel count (xend - xstart)

        sub         x5, sp, #32
        sub         sp, sp, #64             // save the callee-saved low halves of v8-v15
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line   yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
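/* Usage sketch (hypothetical C caller; the buffer names, strides, and
 * convert_nv21() are illustrative, and 4:2:0 subsampling with one chroma
 * row per two luma rows is assumed).  rsdIntrinsicYuv_K expands wrap_line
 * with swapuv=1, so it reads the interleaved chroma plane with V in the
 * even bytes (NV21-style); rsdIntrinsicYuvR_K reads U first.
 *
 *   extern void rsdIntrinsicYuv_K(void *out, void const *yin,
 *                                 void const *uvin, size_t xstart,
 *                                 size_t xend);
 *
 *   void convert_nv21(uint8_t *rgba, const uint8_t *ybuf,
 *                     const uint8_t *vubuf, size_t width, size_t height,
 *                     size_t ystride, size_t vustride)
 *   {
 *       for (size_t row = 0; row < height; row++) {
 *           rsdIntrinsicYuv_K(rgba + row * width * 4,       // RGBA output row
 *                             ybuf + row * ystride,          // luma row
 *                             vubuf + (row / 2) * vustride,  // shared chroma row
 *                             0, width);                     // full-row span
 *       }
 *   }
 */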