    .arch armv7-a
    .text
    .global csc_ARGB8888_to_YUV420SP_NEON
    .type   csc_ARGB8888_to_YUV420SP_NEON, %function
csc_ARGB8888_to_YUV420SP_NEON:
    .fnstart

    @r0     pDstY
    @r1     pDstUV
    @r2     pSrcRGB
    @r3     nWidth
    @r4     pDstY2 = pDstY + nWidth
    @r5     pSrcRGB2 = pSrcRGB + nWidthx4
    @r6     temp7, nWidth16m
    @r7     temp6, accumulator
    @r8     temp5, nWidthTemp
    @r9     temp4, raw ARGB8888 word
    @r10    temp3, r,g,b
    @r11    temp2, immediate operand
    @r12    temp1, nHeight
    @r14    temp0, debugging pointer
    @ (a hedged C prototype sketch is given in the comment block at the end of this file)

    .equ CACHE_LINE_SIZE, 32
    .equ PRE_LOAD_OFFSET, 6

    stmfd       sp!, {r4-r12,r14}           @ backup registers
    ldr         r12, [sp, #40]              @ load nHeight (5th argument, passed on the stack)
    @ldr        r14, [sp, #44]              @ load pTest
    add         r4, r0, r3                  @r4: pDstY2 = pDstY + nWidth
    add         r5, r2, r3, lsl #2          @r5: pSrcRGB2 = pSrcRGB + nWidthx4
    sub         r8, r3, #16                 @r8: nWidthTmp = nWidth - 16

    @q0: temp1, R
    @q1: temp2, GB
    @q2: R
    @q3: G
    @q4: B
    @q5: temp3, output

    vmov.u16    q6, #66                     @coefficient assignment
    vmov.u16    q7, #129
    vmov.u16    q8, #25
    vmov.u16    q9, #0x8080                 @ 128<<8 + 128

    vmov.u16    q10, #0x1000                @ 16<<8 + 128
    vorr.u16    q10, #0x0080

    vmov.u16    q11, #38                    @ used negated: -38
    vmov.u16    q12, #74                    @ used negated: -74
    vmov.u16    q13, #112
    vmov.u16    q14, #94                    @ used negated: -94
    vmov.u16    q15, #18                    @ used negated: -18

LOOP_NHEIGHT2:
    stmfd       sp!, {r12}                  @ backup nHeight counter

LOOP_NWIDTH16:
    pld         [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    @------------------------------------------- UV -------------------------------------------
    vmov.u16    q14, #94                    @#94
    vmov.u16    q15, #18                    @#18
    vld4.8      {d0,d1,d2,d3}, [r2]!        @load 8 ARGB pixels, de-interleaved
    vld4.8      {d4,d5,d6,d7}, [r2]!        @load 8 ARGB pixels, de-interleaved

    vmov.u16    d8, d2
    vmov.u16    d9, d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4

    vand.u16    q4, #0x00FF                 @R (even pixels)
    vand.u16    q5, #0x00FF                 @G (even pixels)
    vand.u16    q6, #0x00FF                 @B (even pixels)

    vmov.u16    q8, q9                      @CalcU()
    vmla.u16    q8, q6, q13                 @+(112 * B[k])
    vmls.u16    q8, q4, q11                 @-( 38 * R[k])
    vmls.u16    q8, q5, q12                 @-( 74 * G[k])
    vshr.u16    q8, q8, #8                  @(128<<8 + 128 + u) >> 8

    vmov.u16    q7, q9                      @CalcV()
    vmla.u16    q7, q4, q13                 @+(112 * R[k])
    vmls.u16    q7, q5, q14                 @-( 94 * G[k])
    vmls.u16    q7, q6, q15                 @-( 18 * B[k])
    vshr.u16    q7, q7, #8                  @(128<<8 + 128 + v) >> 8

    vtrn.8      q8, q7
    vst1.8      {q8}, [r1]!                 @write interleaved UV to the buffer after the Y plane

    @------------------------------------------- Y (row 1) ------------------------------------

    vmov.u16    q14, #66                    @#66
    vmov.u16    q15, #129                   @#129
    vmov.u16    q8, #25                     @#25

    @CalcY()

    vmul.u16    q7, q4, q14                 @q7  =  66 * R[k]
    vmla.u16    q7, q5, q15                 @q7 += 129 * G[k]
    vmla.u16    q7, q6, q8                  @q7 +=  25 * B[k]

    vadd.u16    q7, q7, q10
    vshr.u16    q7, q7, #8

    vmov.u16    d8, d2
    vmov.u16    d9, d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4

    vshr.u16    q4, q4, #8                  @R (odd pixels)
    vshr.u16    q5, q5, #8                  @G (odd pixels)
    vshr.u16    q6, q6, #8                  @B (odd pixels)

    vmul.u16    q0, q4, q14                 @q0  =  66 * R[k]
    vmla.u16    q0, q5, q15                 @q0 += 129 * G[k]
    vmla.u16    q0, q6, q8                  @q0 +=  25 * B[k]
    vadd.u16    q0, q0, q10
    vshr.u16    q0, q0, #8

    vtrn.8      q7, q0
    vst1.8      {q7}, [r0]!                 @write Y (row 1) to the yuv420 buffer

    @------------------------------------------- Y (row 2) ------------------------------------

    pld         [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld4.8      {d0,d1,d2,d3}, [r5]!        @load 8 ARGB pixels, de-interleaved
    vld4.8      {d4,d5,d6,d7}, [r5]!        @load 8 ARGB pixels, de-interleaved

    vmov.u16    d8, d2
    vmov.u16    d9, d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4

    vand.u16    q4, #0x00FF                 @R (even pixels)
    vand.u16    q5, #0x00FF                 @G (even pixels)
    vand.u16    q6, #0x00FF                 @B (even pixels)

    vmul.u16    q7, q4, q14                 @q7  =  66 * R[k]
    vmla.u16    q7, q5, q15                 @q7 += 129 * G[k]
    vmla.u16    q7, q6, q8                  @q7 +=  25 * B[k]
    vadd.u16    q7, q7, q10
    vshr.u16    q7, q7, #8

    vmov.u16    d8, d2
    vmov.u16    d9, d6
    vmov.u16    d10, d1
    vmov.u16    d11, d5
    vmov.u16    d12, d0
    vmov.u16    d13, d4

    vshr.u16    q4, q4, #8                  @R (odd pixels)
    vshr.u16    q5, q5, #8                  @G (odd pixels)
    vshr.u16    q6, q6, #8                  @B (odd pixels)

    vmul.u16    q0, q4, q14                 @q0  =  66 * R[k]
    vmla.u16    q0, q5, q15                 @q0 += 129 * G[k]
    vmla.u16    q0, q6, q8                  @q0 +=  25 * B[k]
    vadd.u16    q0, q0, q10
    vshr.u16    q0, q0, #8

    vtrn.8      q7, q0
    vst1.8      {q7}, [r4]!                 @write Y (row 2) to the yuv420 buffer

    subs        r8, r8, #16                 @nWidth16 -= 16
    BPL         LOOP_NWIDTH16               @loop while nWidth16 >= 0
    @-------------------------------------- unaligned tail -------------------------------------

    adds        r8, r8, #16                 @r8 = remaining pixels (nWidth % 16)
    BEQ         NO_UNALIGNED                @skip if nWidth is a multiple of 16
LOOP_NWIDTH2:
    @---------------------------------- row 1: Y ----------------------------------------------
    @stmfd      sp!, {r14}                  @backup r14

    ldr         r9, [r2], #4                @load ARGB pixel 1
    ldr         r12, [r2], #4               @load ARGB pixel 2

    mov         r10, r9, lsr #16            @pixel 1 >> 16
    mov         r14, r12                    @pixel 2

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @R1 = (rgbIn[k] >> 16) & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @R2 kept in bits 16-23
    add         r10, r10, r14               @pack R1 (low half) and R2 (high half)

    mov         r11, #66                    @accumulator += R * 66
    mul         r7, r10, r11

    mov         r10, r9, lsr #8             @pixel 1 >> 8
    mov         r14, r12, lsl #8            @pixel 2 << 8

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @G1 = (rgbIn[k] >> 8) & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @G2 kept in bits 16-23
    add         r10, r10, r14

    mov         r11, #129                   @accumulator += G * 129
    mla         r7, r10, r11, r7

    mov         r10, r9                     @pixel 1
    mov         r14, r12, lsl #16           @pixel 2 << 16

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @B1 = rgbIn[k] & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @B2 kept in bits 16-23
    add         r10, r10, r14

    mov         r11, #25                    @accumulator += B * 25
    mla         r7, r10, r11, r7

    ldr         r6, =0x10801080             @add (16<<8 + 128) to both packed Y sums
    add         r7, r6

    lsr         r7, #8                      @Y1 in the low byte
    strb        r7, [r0], #1
    lsr         r7, #16                     @Y2 in the low byte
    strb        r7, [r0], #1
    @ldmfd      sp!, {r14}                  @restore r14

    @---------------------------------- row 1: UV ---------------------------------------------

    mov         r10, r9                     @pixel 1
    ldr         r7, =0x00008080             @r7:  U accumulator = 128<<8 + 128
    mov         r12, r7                     @r12: V accumulator = 128<<8 + 128

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @B = rgbIn[k] & 0xFF

    mov         r11, #112                   @U accumulator += B * 112
    mla         r7, r10, r11, r7

    mov         r11, #18                    @V accumulator -= B * 18
    mul         r11, r10, r11
    sub         r12, r12, r11

    mov         r10, r9, lsr #16            @pixel 1 >> 16
    ldr         r6, =0x000000FF
    and         r10, r10, r6                @R = (rgbIn[k] >> 16) & 0xFF

    mov         r11, #38                    @U accumulator -= R * 38
    mul         r11, r10, r11
    sub         r7, r7, r11

    mov         r11, #112                   @V accumulator += R * 112
    mla         r12, r10, r11, r12

    mov         r10, r9, lsr #8             @pixel 1 >> 8
    ldr         r6, =0x000000FF
    and         r10, r10, r6                @G = (rgbIn[k] >> 8) & 0xFF

    mov         r11, #74                    @U accumulator -= G * 74
    mul         r11, r10, r11
    sub         r7, r7, r11

    mov         r11, #94                    @V accumulator -= G * 94
    mul         r11, r10, r11
    sub         r12, r12, r11

    lsr         r7, #8                      @U >> 8
    strb        r7, [r1], #1
    lsr         r12, #8                     @V >> 8
    strb        r12, [r1], #1

    @---------------------------------- row 2: Y ----------------------------------------------
    @stmfd      sp!, {r14}                  @backup r14

    ldr         r9, [r5], #4                @load ARGB pixel 1
    ldr         r12, [r5], #4               @load ARGB pixel 2

    mov         r10, r9, lsr #16            @pixel 1 >> 16
    mov         r14, r12                    @pixel 2

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @R1 = (rgbIn[k] >> 16) & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @R2 kept in bits 16-23
    add         r10, r10, r14

    mov         r11, #66                    @accumulator += R * 66
    mul         r7, r10, r11

    mov         r10, r9, lsr #8             @pixel 1 >> 8
    mov         r14, r12, lsl #8            @pixel 2 << 8

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @G1 = (rgbIn[k] >> 8) & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @G2 kept in bits 16-23
    add         r10, r10, r14

    mov         r11, #129                   @accumulator += G * 129
    mla         r7, r10, r11, r7

    mov         r10, r9                     @pixel 1
    mov         r14, r12, lsl #16           @pixel 2 << 16

    ldr         r6, =0x000000FF
    and         r10, r10, r6                @B1 = rgbIn[k] & 0xFF
    ldr         r6, =0x00FF0000
    and         r14, r14, r6                @B2 kept in bits 16-23
    add         r10, r10, r14

    mov         r11, #25                    @accumulator += B * 25
    mla         r7, r10, r11, r7

    ldr         r6, =0x10801080             @add (16<<8 + 128) to both packed Y sums
    add         r7, r6
    lsr         r7, #8                      @Y1 in the low byte

    strb        r7, [r4], #1
    lsr         r7, #16                     @Y2 in the low byte
    strb        r7, [r4], #1
    @ldmfd      sp!, {r14}                  @restore r14

    subs        r8, r8, #2                  @nWidth2 -= 2
    BGT         LOOP_NWIDTH2                @loop while nWidth2 > 0

NO_UNALIGNED:                               @reached directly when nWidth is a multiple of 16

    @-------------------------------------------------------------------------------------------
    sub         r8, r3, #16                 @r8: nWidthTmp = nWidth - 16
    add         r0, r0, r3                  @pDstY    += nWidth   (skip the row already written as row 2)
    add         r2, r2, r3, lsl #2          @pSrcRGB  += nWidthx4
    add         r4, r4, r3                  @pDstY2   += nWidth
    add         r5, r5, r3, lsl #2          @pSrcRGB2 += nWidthx4

    ldmfd       sp!, {r12}
    subs        r12, r12, #2                @nHeight -= 2
    BGT         LOOP_NHEIGHT2               @loop while nHeight > 0

    ldmfd       sp!, {r4-r12,pc}            @ restore registers and return
    .fnend
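
@ --------------------------------------------------------------------------------------------
@ Usage sketch (assumption): this file does not declare a C prototype. Based on the register
@ map at the top of the function (r0..r3 plus one stack argument), a plausible declaration
@ would be the following; parameter types are a guess, not taken from this file:
@
@   void csc_ARGB8888_to_YUV420SP_NEON(unsigned char *pDstY, unsigned char *pDstUV,
@                                      unsigned char *pSrcRGB, int nWidth, int nHeight);
@
@ The fixed-point arithmetic implemented above corresponds to:
@   Y = ( 66*R + 129*G +  25*B + 0x1080) >> 8
@   U = (-38*R -  74*G + 112*B + 0x8080) >> 8
@   V = (112*R -  94*G -  18*B + 0x8080) >> 8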