/*
 *
 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @file    csc_tiled_to_linear_y.s
 * @brief   SEC_OMX specific define. It supports MFC 6.x tiled.
 * @author  ShinWon Lee (shinwon.lee (at) samsung.com)
 * @version 1.0
 * @history
 *   2012.02.01 : Create
 */

/*
 * Converts tiled data to linear for mfc 6.x
 * 1. Y of NV12T to Y of YUV420P
 * 2. Y of NV12T to Y of YUV420S
 *
 * @param dst
 *   Y address of YUV420[out]
 *
 * @param src
 *   Y address of NV12T[in]
 *
 * @param yuv420_width
 *   real width of YUV420[in]. It should be even.
 *
 * @param yuv420_height
 *   real height of YUV420[in] It should be even.
 *
 * Syntax : GNU as, ARM (A32) instruction set, NEON.
 * ABI    : AAPCS. Arguments arrive in r0-r3, nothing is returned.
 *          NOTE(review): the C prototype is presumably
 *          void csc_tiled_to_linear_y_neon(unsigned char *dst,
 *                  unsigned char *src, unsigned int width,
 *                  unsigned int height) - confirm against the header.
 *
 * Source layout, as implied by the address arithmetic below:
 *   - the picture is split into 16x16 byte tiles;
 *   - one tile is 256 consecutive bytes (16 rows of 16 bytes);
 *   - the tile whose top-left pixel is (j, i) starts at byte offset
 *     tiled_width * i + (j << 4), with tiled_width = width rounded up to 16.
 * The destination is a plain raster with a pitch of `width` bytes.
 *
 * Work is split into four regions:
 *   1. full 16x16 tiles                      -> NEON, 256 bytes per pass
 *   2. right-edge columns of full tile rows  -> halfword copies
 *   3. bottom-edge rows of full-width tiles  -> NEON, two rows per pass
 *   4. bottom-right corner                   -> halfword copies
 * Regions 2 and 4 move two bytes per step and region 3 moves two rows per
 * step, which is why width and height must both be even.
 *
 * Clobbers: r0-r3 are preserved, r4-r12 and d8-d15 (q4-q7) are saved and
 * restored here; q0-q3, q8-q15 and the flags are left modified.
 */

        .arch   armv7-a
        .fpu    neon
        .text
        .global csc_tiled_to_linear_y_neon
        .type   csc_tiled_to_linear_y_neon, %function
csc_tiled_to_linear_y_neon:
        .fnstart

        .equ    CACHE_LINE_SIZE, 64
        .equ    PRE_LOAD_OFFSET, 6
        .equ    TILE_SIZE, 16                   @ tile edge length in pixels/bytes
        .equ    TILE_MASK, (TILE_SIZE - 1)      @ low bits cleared to align to a tile

        @ Register roles
        @ r0  y_dst
        @ r1  y_src
        @ r2  width
        @ r3  height
        @ r4  temp3 (pixel pair / column index in the edge loops)
        @ r5  i (current picture row, multiple of 16)
        @ r6  j (current picture column, multiple of 16)
        @ r7  dst_offset (running destination pointer)
        @ r8  src_offset (running source pointer)
        @ r9  aligned_height
        @ r10 aligned_width
        @ r11 tiled_width
        @ r12 temp1
        @ r14 temp2

        stmfd   sp!, {r4-r12, r14}              @ backup core registers
        .save   {r4-r12, r14}
        vpush   {d8-d15}                        @ q4-q7 are callee-saved under AAPCS
        .vsave  {d8-d15}

        bic     r9, r3, #TILE_MASK              @ aligned_height = height & (~0xF)
        bic     r10, r2, #TILE_MASK             @ aligned_width  = width & (~0xF)
        add     r11, r2, #TILE_MASK             @ tiled_width = (width + 15) & (~0xF)
        bic     r11, r11, #TILE_MASK

        mov     r5, #0                          @ i = 0
        cmp     r9, #0                          @ no complete tile row at all?
        ble     .Lremain_height_start           @ then leave i == aligned_height == 0

@ ---------------------------------------------------------------------------
@ Region 1 + 2: every complete row of tiles (i = 0, 16, ... < aligned_height)
@ ---------------------------------------------------------------------------
.Lloop_main_aligned_height:
        mul     r8, r11, r5                     @ src_offset = tiled_width * i
        mov     r6, #0                          @ j = 0
        add     r8, r1, r8                      @ src_offset = y_src + src_offset
        cmp     r10, #0                         @ width < 16: there is no full tile,
        ble     .Lmain_remain_width_start       @ keep j == aligned_width == 0

        @ One pass copies one 16x16 tile: 256 contiguous source bytes are
        @ loaded into q0-q15 and stored as 16 rows with a pitch of `width`.
        @ The scalar address math is interleaved with the loads on purpose.
.Lloop_main_aligned_width:
        pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
        vld1.8  {q0, q1}, [r8]!
        vld1.8  {q2, q3}, [r8]!
        pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
        vld1.8  {q4, q5}, [r8]!
        vld1.8  {q6, q7}, [r8]!
        mul     r12, r2, r5                     @ temp1 = width * i + j;
        pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
        vld1.8  {q8, q9}, [r8]!
        add     r12, r12, r6
        vld1.8  {q10, q11}, [r8]!
        add     r7, r0, r12                     @ dst_offset = y_dst + temp1
        pld     [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
        vld1.8  {q12, q13}, [r8]!
        vld1.8  {q14, q15}, [r8]!

        vst1.8  {q0}, [r7], r2
        vst1.8  {q1}, [r7], r2
        vst1.8  {q2}, [r7], r2
        vst1.8  {q3}, [r7], r2
        vst1.8  {q4}, [r7], r2
        vst1.8  {q5}, [r7], r2
        vst1.8  {q6}, [r7], r2
        vst1.8  {q7}, [r7], r2
        vst1.8  {q8}, [r7], r2
        vst1.8  {q9}, [r7], r2
        vst1.8  {q10}, [r7], r2
        vst1.8  {q11}, [r7], r2
        vst1.8  {q12}, [r7], r2
        vst1.8  {q13}, [r7], r2
        add     r6, r6, #TILE_SIZE              @ j += 16
        vst1.8  {q14}, [r7], r2
        cmp     r6, r10
        vst1.8  {q15}, [r7], r2
        blt     .Lloop_main_aligned_width

        @ Region 2: columns aligned_width..width-1 of this tile row.
        @ Invariant on entry: j (r6) == aligned_width.
.Lmain_remain_width_start:
        cmp     r10, r2                         @ if (aligned_width != width) {
        beq     .Lmain_remain_width_end

        mul     r8, r11, r5                     @ src_offset = (tiled_width * i) + (j << 4);
        add     r8, r8, r6, lsl #4
        add     r8, r1, r8                      @ r8 = y_src + src_offset

        mul     r12, r2, r5                     @ temp1 = width * i + j;
        add     r12, r12, r6
        add     r7, r0, r12                     @ r7 = y_dst + temp1
        sub     r14, r2, r6                     @ r14 = width - j (bytes per row, even)

        stmfd   sp!, {r0-r1}                    @ r0/r1 are borrowed as loop counters
        mov     r1, #0                          @ r1 = row inside the tile
.Lloop_main_remain_height:
        mov     r0, #0                          @ r0 = byte index inside the row
.Lloop_main_remain_width:
        ldrh    r4, [r8], #2                    @ copy two pixels per step
        strh    r4, [r7], #2
        add     r0, r0, #2
        cmp     r0, r14
        blt     .Lloop_main_remain_width

        sub     r8, r8, r14                     @ rewind both pointers to the row start,
        sub     r7, r7, r14
        add     r8, r8, #TILE_SIZE              @ then step one tile row in the source
        add     r7, r7, r2                      @ and one picture row in the destination

        add     r1, r1, #1
        cmp     r1, #TILE_SIZE
        blt     .Lloop_main_remain_height
        ldmfd   sp!, {r0-r1}                    @ restore y_dst / y_src
.Lmain_remain_width_end:                        @ }

        add     r5, r5, #TILE_SIZE              @ i += 16
        cmp     r5, r9
        blt     .Lloop_main_aligned_height

@ ---------------------------------------------------------------------------
@ Region 3 + 4: the last (height - aligned_height) rows.
@ Invariant on entry: i (r5) == aligned_height.
@ ---------------------------------------------------------------------------
.Lremain_height_start:
        cmp     r9, r3                          @ if (aligned_height != height) {
        beq     .Lremain_height_end

        mov     r6, #0                          @ j = 0
        cmp     r10, #0                         @ width < 16: there is no full-width tile,
        ble     .Lremain_height_remain_width_start  @ keep j == aligned_width == 0

        @ Region 3: full-width tiles, only the first (height - aligned_height)
        @ rows of each tile are copied, two rows per pass.
.Lloop_remain_height_width16:
        mul     r8, r11, r5                     @ src_offset = (tiled_width * i) + (j << 4)
        add     r8, r8, r6, lsl #4
        add     r8, r1, r8                      @ src_offset = y_src + src_offset

        mul     r12, r2, r5                     @ temp1 = width * i + j;
        add     r12, r12, r6
        add     r7, r0, r12                     @ r7 = y_dst + temp1

        sub     r12, r3, r9                     @ r12 = rows left = height - aligned_height
        mov     r14, #0                         @ r14 = rows copied so far
.Lloop_remain_height_width16_height1:
        vld1.8  {q0}, [r8]!
        vld1.8  {q1}, [r8]!
        vst1.8  {q0}, [r7], r2
        vst1.8  {q1}, [r7], r2

        add     r14, r14, #2
        cmp     r14, r12
        blt     .Lloop_remain_height_width16_height1

        add     r6, r6, #TILE_SIZE              @ j += 16
        cmp     r6, r10
        blt     .Lloop_remain_height_width16

        @ Region 4: bottom-right corner.
        @ Invariant on entry: j (r6) == aligned_width.
.Lremain_height_remain_width_start:
        cmp     r10, r2                         @ if (aligned_width != width) {
        beq     .Lremain_height_end
        mul     r8, r11, r5                     @ src_offset = (tiled_width * i) + (j << 4)
        add     r8, r8, r6, lsl #4
        add     r8, r1, r8                      @ src_offset = y_src + src_offset

        mul     r12, r2, r5                     @ temp1 = width * i + j;
        add     r12, r12, r6
        add     r7, r0, r12                     @ r7 = y_dst + temp1

        stmfd   sp!, {r0-r1, r3}                @ r0/r1/r3 are borrowed below
        mov     r0, #0                          @ r0 = row counter
        sub     r1, r3, r9                      @ r1 = rows left = height - aligned_height
.Lloop_remain_height_remain_width_height1:

        sub     r14, r2, r6                     @ r14 = width - j (bytes per row, even)
        mov     r4, #0                          @ r4 = byte index inside the row
.Lloop_remain_height_remain_width_height1_widthx:
        ldrh    r3, [r8], #2                    @ copy two pixels per step
        strh    r3, [r7], #2
        add     r4, r4, #2
        cmp     r4, r14
        blt     .Lloop_remain_height_remain_width_height1_widthx

        sub     r8, r8, r14                     @ rewind both pointers to the row start,
        sub     r7, r7, r14
        add     r8, r8, #TILE_SIZE              @ then step one tile row in the source
        add     r7, r7, r2                      @ and one picture row in the destination

        add     r0, r0, #1
        cmp     r0, r1
        blt     .Lloop_remain_height_remain_width_height1
        ldmfd   sp!, {r0-r1, r3}                @ restore y_dst / y_src / height

.Lremain_height_end:                            @ } }
        vpop    {d8-d15}                        @ restore q4-q7
        ldmfd   sp!, {r4-r12, r15}              @ restore registers and return

        .fnend
        .size   csc_tiled_to_linear_y_neon, .-csc_tiled_to_linear_y_neon