/*
 *
 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_tiled_to_linear_uv_deinterleave_neon.s
 * @brief   SEC_OMX specific define. It supports the MFC 6.x tiled format.
 * @author  ShinWon Lee (shinwon.lee@samsung.com)
 * @version 1.0
 * @history
 *   2012.02.01 : Create
 */

/*
 * Converts and deinterleaves tiled data to linear format for MFC 6.x
 * 1. UV plane of NV12T to U and V planes of YUV420P
 *
 * @param u_dst
 *   U address of YUV420[out]
 *
 * @param v_dst
 *   V address of YUV420[out]
 *
 * @param uv_src
 *   UV address of NV12T[in]
 *
 * @param yuv420_width
 *   real width of YUV420[in]. It should be even.
 *
 * @param yuv420_height
 *   real height of YUV420[in]. It should be even.
 */
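/*
 * Reference model (illustrative only, not part of the build): a plain C
 * sketch of the transform implemented below, assuming the 16x8-byte tile
 * layout implied by the address arithmetic (each tile is 16 bytes wide and
 * 8 lines tall, stored row-major; tiles are laid out left to right, so a
 * band of 8 source lines occupies tiled_width * 8 bytes). The helper name
 * uv_deinterleave_ref is hypothetical.
 *
 *   static void uv_deinterleave_ref(unsigned char *u_dst,
 *                                   unsigned char *v_dst,
 *                                   const unsigned char *uv_src,
 *                                   int width, int height)
 *   {
 *       int tiled_width = (width + 15) & ~15;  // source pitch, 16-aligned
 *       for (int i = 0; i < height; i++) {
 *           for (int j = 0; j < width; j += 2) {
 *               const unsigned char *s = uv_src
 *                   + tiled_width * (i & ~7)   // start of this 8-line band
 *                   + ((j & ~15) << 3)         // tile within the band (128 B each)
 *                   + ((i & 7) << 4)           // line within the tile
 *                   + (j & 15);                // byte within the line
 *               u_dst[(width >> 1) * i + (j >> 1)] = s[0];
 *               v_dst[(width >> 1) * i + (j >> 1)] = s[1];
 *           }
 *       }
 *   }
 */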
.arch armv7-a
.text
.global csc_tiled_to_linear_uv_deinterleave_neon
.type   csc_tiled_to_linear_uv_deinterleave_neon, %function
csc_tiled_to_linear_uv_deinterleave_neon:
    .fnstart

    .equ CACHE_LINE_SIZE, 64
    .equ PRE_LOAD_OFFSET, 6

    @r0     u_dst
    @r1     v_dst
    @r2     uv_src
    @r3     width
    @r4     height
    @r5     i
    @r6     j
    @r7     dst_offset
    @r8     src_offset
    @r9     aligned_height
    @r10    aligned_width
    @r11    tiled_width
    @r12    temp1
    @r14    temp2

    stmfd       sp!, {r4-r12,r14}       @ backup registers
    ldr         r4, [sp, #40]           @ r4 = height

    bic         r9, r4, #0x7            @ aligned_height = height & (~0x7)
    bic         r10, r3, #0xF           @ aligned_width = width & (~0xF)
    add         r11, r3, #15            @ tiled_width = ((width + 15) >> 4) << 4
    mov         r11, r11, asr #4
    mov         r11, r11, lsl #4

    mov         r5, #0
LOOP_MAIN_ALIGNED_HEIGHT:
    mul         r8, r11, r5             @ src_offset = tiled_width * i
    mov         r6, #0
    add         r8, r2, r8              @ src_offset = uv_src + src_offset
LOOP_MAIN_ALIGNED_WIDTH:
    mov         r12, r3, asr #1         @ temp1 = (width >> 1) * i + (j >> 1)
    mul         r12, r12, r5

    pld         [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld2.8      {q0, q1}, [r8]!
    add         r12, r12, r6, asr #1
    vld2.8      {q2, q3}, [r8]!
    add         r7, r0, r12             @ dst_offset = u_dst + temp1
    pld         [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld2.8      {q4, q5}, [r8]!
    mov         r14, r3, asr #1         @ temp2 = width / 2
    vld2.8      {q6, q7}, [r8]!

    vst1.8      {d0}, [r7], r14
    vst1.8      {d1}, [r7], r14
    vst1.8      {d4}, [r7], r14
    vst1.8      {d5}, [r7], r14
    vst1.8      {d8}, [r7], r14
    vst1.8      {d9}, [r7], r14
    vst1.8      {d12}, [r7], r14
    vst1.8      {d13}, [r7], r14

    add         r7, r1, r12             @ dst_offset = v_dst + temp1

    vst1.8      {d2}, [r7], r14
    vst1.8      {d3}, [r7], r14
    vst1.8      {d6}, [r7], r14
    vst1.8      {d7}, [r7], r14
    vst1.8      {d10}, [r7], r14
    vst1.8      {d11}, [r7], r14
    add         r6, r6, #16
    vst1.8      {d14}, [r7], r14
    cmp         r6, r10
    vst1.8      {d15}, [r7], r14
    blt         LOOP_MAIN_ALIGNED_WIDTH

MAIN_REMAIN_WIDTH_START:
    cmp         r10, r3                 @ if (aligned_width != width) {
    beq         MAIN_REMAIN_WIDTH_END
    stmfd       sp!, {r0-r2,r4}         @ backup registers
    mul         r8, r11, r5             @ src_offset = (tiled_width * i) + (j << 3)
    add         r8, r8, r6, lsl #3
    add         r8, r2, r8              @ r8 = uv_src + src_offset
    mov         r12, r3, asr #1         @ temp1 = (width >> 1) * i + (j >> 1)
    mul         r12, r12, r5
    add         r12, r12, r6, asr #1
    add         r7, r0, r12             @ r7 = u_dst + temp1
    add         r12, r1, r12            @ r12 = v_dst + temp1
    sub         r14, r3, r6             @ r14 = (width - j) / 2
    mov         r14, r14, asr #1

    mov         r4, #0
LOOP_MAIN_REMAIN_HEIGHT:
    mov         r0, #0                  @ r0 is index in de-interleave
LOOP_MAIN_REMAIN_WIDTH:
    ldrb        r1, [r8], #1
    ldrb        r2, [r8], #1
    strb        r1, [r7], #1
    strb        r2, [r12], #1
    add         r0, #1
    cmp         r0, r14
    blt         LOOP_MAIN_REMAIN_WIDTH

    sub         r8, r8, r14, lsl #1
    sub         r7, r7, r14
    sub         r12, r12, r14
    add         r8, r8, #16
    add         r7, r7, r3, asr #1
    add         r12, r12, r3, asr #1

    add         r4, #1
    cmp         r4, #8
    blt         LOOP_MAIN_REMAIN_HEIGHT
    ldmfd       sp!, {r0-r2,r4}         @ restore registers
MAIN_REMAIN_WIDTH_END:

    add         r5, r5, #8
    cmp         r5, r9
    blt         LOOP_MAIN_ALIGNED_HEIGHT
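@ At this point every full band of 8 lines has been copied. Any leftover
@ lines (height & 7, at the bottom of the plane) still sit in one final
@ band of tiles; they are handled below 16 source bytes (8 U/V pairs) at
@ a time, with a scalar tail for widths that are not multiples of 16.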
REMAIN_HEIGHT_START:
    cmp         r9, r4                  @ if (aligned_height != height) {
    beq         REMAIN_HEIGHT_END

    mov         r6, #0
LOOP_REMAIN_HEIGHT_WIDTH16:
    mul         r8, r11, r5             @ src_offset = (tiled_width * i) + (j << 3)
    add         r8, r8, r6, lsl #3
    add         r8, r2, r8              @ src_offset = uv_src + src_offset

    mov         r12, r3, asr #1         @ temp1 = (width >> 1) * i + (j >> 1)
    mul         r12, r12, r5
    add         r12, r12, r6, asr #1
    add         r7, r0, r12             @ r7 = u_dst + temp1
    add         r12, r1, r12            @ r12 = v_dst + temp1
    mov         r14, r3, asr #1         @ temp2 = width / 2

    stmfd       sp!, {r0-r1}            @ backup registers
    mov         r0, #0
    sub         r1, r4, r9
LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1:
    vld2.8      {d0, d1}, [r8]!
    vst1.8      {d0}, [r7], r14
    vst1.8      {d1}, [r12], r14

    add         r0, r0, #1
    cmp         r0, r1
    blt         LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1
    ldmfd       sp!, {r0-r1}            @ restore registers

    add         r6, r6, #16
    cmp         r6, r10
    blt         LOOP_REMAIN_HEIGHT_WIDTH16

REMAIN_HEIGHT_REMAIN_WIDTH_START:
    cmp         r10, r3
    beq         REMAIN_HEIGHT_REMAIN_WIDTH_END
    mul         r8, r11, r5             @ src_offset = (tiled_width * i) + (j << 3)
    add         r8, r8, r6, lsl #3
    add         r8, r2, r8              @ src_offset = uv_src + src_offset

    mov         r12, r3, asr #1         @ temp1 = (width >> 1) * i + (j >> 1)
    mul         r12, r12, r5
    add         r12, r12, r6, asr #1
    add         r7, r0, r12             @ r7 = u_dst + temp1
    add         r12, r1, r12            @ r12 = v_dst + temp1
    sub         r14, r3, r6             @ r14 = (width - j) / 2
    mov         r14, r14, asr #1

    stmfd       sp!, {r0-r2,r4-r5}      @ backup registers
    mov         r0, #0
    sub         r1, r4, r9
LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1:

    mov         r4, #0
LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx:
    ldrb        r2, [r8], #1
    ldrb        r5, [r8], #1
    strb        r2, [r7], #1
    strb        r5, [r12], #1
    add         r4, #1
    cmp         r4, r14
    blt         LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx

    sub         r8, r8, r14, lsl #1
    sub         r7, r7, r14
    sub         r12, r12, r14
    add         r8, r8, #16
    add         r7, r7, r3, asr #1
    add         r12, r12, r3, asr #1

    add         r0, r0, #1
    cmp         r0, r1
    blt         LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1
    ldmfd       sp!, {r0-r2,r4-r5}      @ restore registers

REMAIN_HEIGHT_REMAIN_WIDTH_END:

REMAIN_HEIGHT_END:

RESTORE_REG:
    ldmfd       sp!, {r4-r12,r15}       @ restore registers and return

    .fnend
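/*
 * Calling sketch (illustrative, not part of this file): a C prototype
 * consistent with the AAPCS usage above (four register arguments plus
 * yuv420_height taken from the stack). Argument types are assumptions;
 * as the code shows, each of u_dst and v_dst receives yuv420_width / 2
 * bytes per line for yuv420_height lines, while uv_src is read at a
 * pitch of (yuv420_width + 15) & ~15 bytes.
 *
 *   extern void csc_tiled_to_linear_uv_deinterleave_neon(
 *           unsigned char *u_dst, unsigned char *v_dst,
 *           unsigned char *uv_src,
 *           unsigned int yuv420_width, unsigned int yuv420_height);
 */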