/*
 *
 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_tiled_to_linear_uv.s
 * @brief   SEC_OMX specific define. It supports MFC 6.x tiled.
 * @author  ShinWon Lee (shinwon.lee (at) samsung.com)
 * @version 1.0
 * @history
 *   2012.02.01 : Create
 */

/*
 * Converts tiled data to linear for MFC 6.x tiled format.
 * 1. uv of nv12t to uv of yuv420s
 *
 * void csc_tiled_to_linear_uv_neon(char *dst, char *src,
 *                                  int yuv420_width, int yuv420_height)
 *
 * @param dst
 *   uv address of yuv420s [out]
 *
 * @param src
 *   uv address of nv12t [in]
 *
 * @param yuv420_width
 *   real width of yuv420s [in]
 *
 * @param yuv420_height
 *   real height of the uv plane of yuv420s [in]
 *
 * Tile geometry (UV plane of MFC 6.x NV12T): each tile is 16 bytes wide
 * and 8 lines high (128 bytes), tiles laid out row-major across a width
 * rounded up to a multiple of 16.
 */
    .arch armv7-a
    .text
    .global csc_tiled_to_linear_uv_neon
    .type   csc_tiled_to_linear_uv_neon, %function
csc_tiled_to_linear_uv_neon:
    .fnstart

    .equ CACHE_LINE_SIZE, 64
    .equ PRE_LOAD_OFFSET, 6             @ prefetch distance: 6 cache lines ahead

    @ Register roles:
    @ r0  uv_dst (linear destination)
    @ r1  uv_src (tiled source)
    @ r2  width  (linear stride in bytes)
    @ r3  height (uv-plane height in lines)
    @ r4  temp3 / scratch
    @ r5  i  (tile-row line index, steps of 8)
    @ r6  j  (column index in bytes, steps of 16)
    @ r7  dst_offset
    @ r8  src_offset
    @ r9  aligned_height (height rounded down to 8)
    @ r10 aligned_width  (width rounded down to 16)
    @ r11 tiled_width    (width rounded up to 16)
    @ r12 temp1
    @ r14 temp2

    stmfd   sp!, {r4-r12,r14}           @ save callee-saved registers + lr
    @ NOTE(review): the original code performed "ldr r4, [sp, #40]" here,
    @ reading a nonexistent 5th stack argument; r4 was always overwritten
    @ before use, so the dead (and out-of-args) load has been removed.

    bic     r9, r3, #0x7                @ aligned_height = height & (~0x7); UV tiles are 8 lines
    bic     r10, r2, #0xF               @ aligned_width = width & (~0xF)
    add     r11, r2, #15                @ tiled_width = ((width + 15) >> 4) << 4
    mov     r11, r11, asr #4
    mov     r11, r11, lsl #4

    mov     r5, #0
LOOP_MAIN_ALIGNED_HEIGHT:
    mul     r8, r11, r5                 @ src_offset = tiled_width * i
    mov     r6, #0
    add     r8, r1, r8                  @ src_offset = uv_src + src_offset
LOOP_MAIN_ALIGNED_WIDTH:
    @ Copy one full 16x8 tile: load 128 contiguous tiled bytes, store as
    @ 8 rows of 16 bytes at linear stride r2.
    pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld1.8  {q0, q1}, [r8]!
    mul     r12, r2, r5                 @ temp1 = width * i + j
    vld1.8  {q2, q3}, [r8]!
    add     r12, r12, r6
    pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
    vld1.8  {q4, q5}, [r8]!
    add     r7, r0, r12                 @ dst_offset = uv_dst + temp1
    vld1.8  {q6, q7}, [r8]!

    vst1.8  {q0}, [r7], r2
    vst1.8  {q1}, [r7], r2
    vst1.8  {q2}, [r7], r2
    vst1.8  {q3}, [r7], r2
    vst1.8  {q4}, [r7], r2
    vst1.8  {q5}, [r7], r2
    vst1.8  {q6}, [r7], r2
    vst1.8  {q7}, [r7], r2
    add     r6, r6, #16
    cmp     r6, r10
    blt     LOOP_MAIN_ALIGNED_WIDTH

MAIN_REMAIN_WIDTH_START:
    @ Right-edge partial tile (width not a multiple of 16): scalar copy of
    @ (width - j) bytes per line for the 8 lines of this tile row.
    cmp     r10, r2                     @ if (aligned_width != width) {
    beq     MAIN_REMAIN_WIDTH_END

    mul     r8, r11, r5                 @ src_offset = (tiled_width * i) + (j << 3)
    add     r8, r8, r6, lsl #3          @ j << 3: each 16-wide column advances 16*8 = (j*8) bytes
    add     r8, r1, r8                  @ r8 = uv_src + src_offset

    mul     r12, r2, r5                 @ temp1 = width * i + j
    add     r12, r12, r6
    add     r7, r0, r12                 @ r7 = uv_dst + temp1
    sub     r14, r2, r6                 @ r14 = width - j (remaining bytes per line)

    stmfd   sp!, {r0-r1}                @ free r0/r1 as loop counters
    mov     r1, #0
LOOP_MAIN_REMAIN_HEIGHT:
    mov     r0, #0                      @ r0 is byte index within the line
LOOP_MAIN_REMAIN_WIDTH:
    ldrh    r4, [r8], #2                @ copy one UV pair (2 bytes)
    strh    r4, [r7], #2
    add     r0, #2
    cmp     r0, r14
    blt     LOOP_MAIN_REMAIN_WIDTH

    sub     r8, r8, r14                 @ rewind to line start, then step
    sub     r7, r7, r14
    add     r8, r8, #16                 @ next tiled line (tile line pitch = 16)
    add     r7, r7, r2                  @ next linear line

    add     r1, #1
    cmp     r1, #8                      @ 8 lines per tile row
    blt     LOOP_MAIN_REMAIN_HEIGHT
    ldmfd   sp!, {r0-r1}                @ restore dst/src base pointers
MAIN_REMAIN_WIDTH_END:

    add     r5, r5, #8                  @ next tile row (8 lines)
    cmp     r5, r9
    blt     LOOP_MAIN_ALIGNED_HEIGHT

REMAIN_HEIGHT_START:
    @ Bottom partial tile row (height not a multiple of 8): copy the
    @ remaining (height - aligned_height) lines, two 16-byte lines per
    @ NEON iteration for the 16-aligned columns.
    cmp     r9, r3                      @ if (aligned_height != height) {
    beq     REMAIN_HEIGHT_END

    mov     r6, #0
LOOP_REMAIN_HEIGHT_WIDTH16:
    mul     r8, r11, r5                 @ src_offset = (tiled_width * i) + (j << 3)
    add     r8, r8, r6, lsl #3
    add     r8, r1, r8                  @ src_offset = uv_src + src_offset

    mul     r12, r2, r5                 @ temp1 = width * i + j
    add     r12, r12, r6
    add     r7, r0, r12                 @ r7 = uv_dst + temp1

    sub     r12, r3, r9                 @ r12 = remaining lines
    mov     r14, #0
LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1:
    vld1.8  {q0}, [r8]!
    vld1.8  {q1}, [r8]!
    vst1.8  {q0}, [r7], r2
    vst1.8  {q1}, [r7], r2

    add     r14, r14, #2                @ two lines per iteration
    cmp     r14, r12
    blt     LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1

    add     r6, r6, #16
    cmp     r6, r10
    blt     LOOP_REMAIN_HEIGHT_WIDTH16

REMAIN_HEIGHT_REMAIN_WIDTH_START:
    @ Bottom-right corner: partial width AND partial height, scalar copy.
    cmp     r10, r2
    beq     REMAIN_HEIGHT_REMAIN_WIDTH_END
    mul     r8, r11, r5                 @ src_offset = (tiled_width * i) + (j << 3)
    add     r8, r8, r6, lsl #3
    add     r8, r1, r8                  @ src_offset = uv_src + src_offset

    mul     r12, r2, r5                 @ temp1 = width * i + j
    add     r12, r12, r6
    add     r7, r0, r12                 @ r7 = uv_dst + temp1

    stmfd   sp!, {r0-r1,r3}             @ free r0/r1/r3 as counters
    mov     r0, #0                      @ r0 = line counter
    sub     r1, r3, r9                  @ r1 = remaining lines
LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1:

    sub     r14, r2, r6                 @ r14 = width - j
    mov     r4, #0
LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx:
    ldrh    r3, [r8], #2                @ copy one UV pair (2 bytes)
    strh    r3, [r7], #2
    add     r4, #2
    cmp     r4, r14
    blt     LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx

    sub     r8, r8, r14                 @ rewind to line start, then step
    sub     r7, r7, r14
    add     r8, r8, #16                 @ next tiled line
    add     r7, r7, r2                  @ next linear line

    add     r0, r0, #1
    cmp     r0, r1
    blt     LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1
    ldmfd   sp!, {r0-r1,r3}             @ restore registers

REMAIN_HEIGHT_REMAIN_WIDTH_END:

REMAIN_HEIGHT_END:

RESTORE_REG:
    ldmfd   sp!, {r4-r12,r15}           @ restore registers and return (pop pc)

    .fnend