@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_copy_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter for copy
@*
@* @par description:
@*  copies the array of width 'wd' and height 'ht' from the location pointed
@*  by 'src' to the location pointed by 'dst'. 'wd' is doubled on entry, so
@*  every row that is copied is 2 * wd bytes long (presumably because cb and
@*  cr samples are interleaved - confirm against the caller).
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (bytes between the starts of consecutive rows)
@*
@* @param[in] dst_strd
@*  integer destination stride (bytes between the starts of consecutive rows)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (never read by this routine)
@*
@* @param[in] ht
@*  integer height of the array; nothing is copied when ht <= 0
@*
@* @param[in] wd
@*  integer width of the array; 2 * wd bytes are copied per row
@*
@* @returns
@*  none
@*
@* @remarks
@*  - rows are copied four at a time. any non-zero remainder (ht % 4) is
@*    handled by a single pass that copies exactly two rows, so callers are
@*    expected to pass ht with ht % 4 == 0 or ht % 4 == 2.
@*  - the copy step is 16, 8 or 4 bytes depending on the largest of those
@*    that divides 2 * wd; 2 * wd is expected to be a multiple of 4.
@*  - the two-row tail walks the full row width on every path (the 8 and 16
@*    byte tails previously copied only the first 8 / 16 bytes of each row).
@*  - q0-q3 are used as scratch and are not preserved.
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@   r0          => *pu1_src   (start of the current row group, advances by column)
@   r1          => *pu1_dst   (start of the current row group, advances by column)
@   r2          => src_strd
@   r3          => dst_strd
@   [sp,#40]    => *pi1_coeff (after the register save; unused)
@   [sp,#44]    => ht         (after the register save)
@   [sp,#48]    => wd         (after the register save)
@   r4          => bytes still to copy in the current row group
@   r5          => source pointer walking down the rows of the current column
@   r6          => destination pointer walking down the rows of the current column
@   r7          => rows still to copy in groups of four (ht - ht % 4)
@   r8          => ht % 4
@   r11         => (2 * wd) - step, used to rewind a row pointer to column 0
@   r12         => 2 * wd, the row width in bytes

.text
.align 4

.globl ihevc_inter_pred_chroma_copy_a9q

.type ihevc_inter_pred_chroma_copy_a9q, %function

ihevc_inter_pred_chroma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @save 10 registers (40 bytes); stack args now start at sp+40
    ldr         r12,[sp,#48]                @r12 = wd
    lsl         r12,r12,#1                  @r12 = 2 * wd = row width in bytes
    ldr         r7,[sp,#44]                 @r7 = ht
    cmp         r7,#0
    ble         end_loops                   @ht <= 0 : nothing to copy
    and         r8,r7,#3                    @r8 = ht % 4 (rows left for the two-row tail)
    sub         r7,r7,r8                    @r7 = ht rounded down to a multiple of 4
    tst         r12,#15
    beq         core_loop_wd_16             @row width is a multiple of 16 bytes
    tst         r12,#7
    beq         core_loop_wd_8              @row width is a multiple of 8 bytes

    @------------------------------------------------------------------
    @ 4 bytes per step
    @------------------------------------------------------------------
    sub         r11,r12,#4                  @r11 = row width - 4 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_4_ht_2        @fewer than 4 rows : go straight to the tail

outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = bytes to copy in this group of 4 rows
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.32     {d0[0]},[r0]                @load 4 bytes of row 0
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.32     {d0[0]},[r1]                @store 4 bytes of row 0
    vld1.32     {d0[0]},[r5],r2             @load row 1, step source to row 2
    add         r0,r0,#4                    @next source column
    vst1.32     {d0[0]},[r6],r3             @store row 1, step destination to row 2
    vld1.32     {d0[0]},[r5],r2             @load row 2, step source to row 3
    subs        r4,r4,#4                    @4 bytes of width done (flags kept until bgt)
    vst1.32     {d0[0]},[r6],r3             @store row 2, step destination to row 3
    vld1.32     {d0[0]},[r5],r2             @load row 3, step source to row 4
    add         r1,r1,#4                    @next destination column
    vst1.32     {d0[0]},[r6],r3             @store row 3, step destination to row 4
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @4 rows done
    sub         r0,r5,r11                   @r0 = column 0 of the next group of rows
    sub         r1,r6,r11                   @r1 = column 0 of the next group of rows
    bgt         outer_loop_wd_4
    cmp         r8,#0
    bgt         outer_loop_wd_4_ht_2        @remainder rows pending : run the tail

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)


outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @r4 = bytes to copy in the last 2 rows
    ble         end_loops

inner_loop_wd_4_ht_2:
    vld1.32     {d0[0]},[r0]                @load 4 bytes of row 0
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.32     {d0[0]},[r1]                @store 4 bytes of row 0
    vld1.32     {d0[0]},[r5],r2             @load row 1
    add         r0,r0,#4                    @next source column
    vst1.32     {d0[0]},[r6],r3             @store row 1
    subs        r4,r4,#4                    @4 bytes of width done
    add         r1,r1,#4                    @next destination column
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

    @------------------------------------------------------------------
    @ 8 bytes per step
    @------------------------------------------------------------------
core_loop_wd_8:
    sub         r11,r12,#8                  @r11 = row width - 8 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_8_ht_2        @fewer than 4 rows : go straight to the tail

outer_loop_wd_8:
    subs        r4,r12,#0                   @r4 = bytes to copy in this group of 4 rows
    ble         end_inner_loop_wd_8

inner_loop_wd_8:
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    vld1.8      {d0},[r0]!                  @load 8 bytes of row 0, advance source column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.8      {d0},[r1]!                  @store 8 bytes of row 0, advance destination column
    vld1.8      {d1},[r5],r2                @load row 1, step source to row 2
    vst1.8      {d1},[r6],r3                @store row 1, step destination to row 2
    subs        r4,r4,#8                    @8 bytes of width done (flags kept until bgt)
    vld1.8      {d2},[r5],r2                @load row 2, step source to row 3
    vst1.8      {d2},[r6],r3                @store row 2, step destination to row 3
    vld1.8      {d3},[r5],r2                @load row 3, step source to row 4
    vst1.8      {d3},[r6],r3                @store row 3, step destination to row 4
    bgt         inner_loop_wd_8

end_inner_loop_wd_8:
    subs        r7,r7,#4                    @4 rows done
    sub         r0,r5,r11                   @r0 = column 0 of the next group of rows
    sub         r1,r6,r11                   @r1 = column 0 of the next group of rows
    bgt         outer_loop_wd_8
    cmp         r8,#0
    bgt         outer_loop_wd_8_ht_2        @remainder rows pending : run the tail
    b           end_loops

outer_loop_wd_8_ht_2:
    subs        r4,r12,#0                   @r4 = bytes to copy in the last 2 rows
    ble         end_loops

inner_loop_wd_8_ht_2:
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    vld1.8      {d0},[r0]!                  @load 8 bytes of row 0, advance source column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.8      {d0},[r1]!                  @store 8 bytes of row 0, advance destination column
    vld1.8      {d1},[r5],r2                @load row 1
    vst1.8      {d1},[r6],r3                @store row 1
    subs        r4,r4,#8                    @8 bytes of width done
    bgt         inner_loop_wd_8_ht_2        @keep going until the whole row width is copied
    b           end_loops

    @------------------------------------------------------------------
    @ 16 bytes per step
    @------------------------------------------------------------------
core_loop_wd_16:
    sub         r11,r12,#16                 @r11 = row width - 16 (column rewind amount)
    cmp         r7,#0
    beq         outer_loop_wd_16_ht_2       @fewer than 4 rows : go straight to the tail

outer_loop_wd_16:
    subs        r4,r12,#0                   @r4 = bytes to copy in this group of 4 rows
    ble         end_inner_loop_wd_16

inner_loop_wd_16:
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    vld1.8      {q0},[r0]!                  @load 16 bytes of row 0, advance source column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.8      {q0},[r1]!                  @store 16 bytes of row 0, advance destination column
    vld1.8      {q1},[r5],r2                @load row 1, step source to row 2
    vst1.8      {q1},[r6],r3                @store row 1, step destination to row 2
    subs        r4,r4,#16                   @16 bytes of width done (flags kept until bgt)
    vld1.8      {q2},[r5],r2                @load row 2, step source to row 3
    vst1.8      {q2},[r6],r3                @store row 2, step destination to row 3
    vld1.8      {q3},[r5],r2                @load row 3, step source to row 4
    vst1.8      {q3},[r6],r3                @store row 3, step destination to row 4
    bgt         inner_loop_wd_16

end_inner_loop_wd_16:
    subs        r7,r7,#4                    @4 rows done
    sub         r0,r5,r11                   @r0 = column 0 of the next group of rows
    sub         r1,r6,r11                   @r1 = column 0 of the next group of rows
    bgt         outer_loop_wd_16
    cmp         r8,#0
    bgt         outer_loop_wd_16_ht_2       @remainder rows pending : run the tail
    b           end_loops

outer_loop_wd_16_ht_2:
    subs        r4,r12,#0                   @r4 = bytes to copy in the last 2 rows
    ble         end_loops

inner_loop_wd_16_ht_2:
    add         r5,r0,r2                    @r5 = source, row 1 of this column
    vld1.8      {q0},[r0]!                  @load 16 bytes of row 0, advance source column
    add         r6,r1,r3                    @r6 = destination, row 1 of this column
    vst1.8      {q0},[r1]!                  @store 16 bytes of row 0, advance destination column
    vld1.8      {q1},[r5],r2                @load row 1
    vst1.8      {q1},[r6],r3                @store row 1
    subs        r4,r4,#16                   @16 bytes of width done
    bgt         inner_loop_wd_16_ht_2       @keep going until the whole row width is copied

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)