@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma function for copy
@*
@* @par description:
@*     copies the array of width 'wd' and height 'ht' from the location pointed
@*     to by 'src' to the location pointed to by 'dst'
@*
@* @param[in] pu1_src
@*     uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*     uword8 pointer to the destination
@*
@* @param[in] src_strd
@*     integer source stride
@*
@* @param[in] dst_strd
@*     integer destination stride
@*
@* @param[in] pi1_coeff
@*     word8 pointer to the filter coefficients (never read by this routine;
@*     present only so the argument list matches the prototype below)
@*
@* @param[in] ht
@*     integer height of the array
@*
@* @param[in] wd
@*     integer width of the array
@*
@* @returns
@*     nothing (the value left in r0 is not meaningful)
@*
@* @remarks
@*     - Rows are always copied four at a time, so a 'ht' that is not a
@*       multiple of 4 is effectively rounded up to the next multiple of 4.
@*     - The column step is chosen from 'wd': 16 bytes when wd % 16 == 0,
@*       8 bytes when wd % 8 == 0, otherwise 4 bytes. In the 4-byte path a
@*       'wd' that is not a multiple of 4 is rounded up to the next multiple.
@*     - Returns immediately, without touching memory, when ht <= 0 or wd <= 0.
@*     - Only NEON registers d0-d7 (q0-q3) are written.
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_copy (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

@**************variables vs registers*****************************************
@   r0  => *pu1_src   (start of the current band of 4 rows)
@   r1  => *pu1_dst   (start of the current band of 4 rows)
@   r2  =>  src_strd
@   r3  =>  dst_strd
@   r4  =>  columns still to copy in the current band
@   r5  =>  pu1_src_tmp (walks rows 1..3 of the current tile)
@   r6  =>  pu1_dst_tmp (walks rows 1..3 of the current tile)
@   r7  =>  ht (rows still to copy)
@   r11 =>  wd - column step, used to rewind to the start of the next band
@   r12 =>  wd

.text
.align 4

.globl ihevc_inter_pred_luma_copy_a9q

.type ihevc_inter_pred_luma_copy_a9q, %function

ihevc_inter_pred_luma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
    ldr         r12,[sp,#48]                @loads wd (7th argument: 40 saved bytes + 8)
    ldr         r7,[sp,#44]                 @loads ht (6th argument: 40 saved bytes + 4)
    cmp         r7,#0                       @nothing to copy when ht <= 0
    ble         end_loops
    cmp         r12,#0                      @nothing to copy when wd <= 0; leaving here
    ble         end_loops                   @avoids looping on the never-written r5/r6
    tst         r12,#15                     @wd a multiple of 16?
    beq         core_loop_wd_16
    tst         r12,#7                      @wd a multiple of 8?
    beq         core_loop_wd_8
    sub         r11,r12,#4                  @r11 = wd - 4 (rewind amount for the 4-wide path)

@ 4-wide path: each inner iteration copies one 4x4 byte tile
outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = wd, columns left in this band
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.32     {d0[0]},[r0]                @row 0: vld1_lane_u32((uint32_t *)pu1_src, src_tmp, 0)
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.32     {d0[0]},[r1]                @row 0: vst1_lane_u32((uint32_t *)pu1_dst, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @row 1 load,  pu1_src_tmp += src_strd
    add         r0,r0,#4                    @pu1_src += 4
    vst1.32     {d0[0]},[r6],r3             @row 1 store, pu1_dst_tmp += dst_strd
    vld1.32     {d0[0]},[r5],r2             @row 2 load,  pu1_src_tmp += src_strd
    subs        r4,r4,#4                    @columns left -= 4 (sets flags for the bgt below)
    vst1.32     {d0[0]},[r6],r3             @row 2 store, pu1_dst_tmp += dst_strd
    vld1.32     {d0[0]},[r5],r2             @row 3 load,  pu1_src_tmp += src_strd
    add         r1,r1,#4                    @pu1_dst += 4
    vst1.32     {d0[0]},[r6],r3             @row 3 store, pu1_dst_tmp += dst_strd

    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = start of the next band of 4 rows
    sub         r1,r6,r11                   @pu1_dst = start of the next band of 4 rows
    bgt         outer_loop_wd_4

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (saved lr -> pc)


@ 8-wide path: each inner iteration copies one 8x4 byte tile
core_loop_wd_8:
    sub         r11,r12,#8                  @r11 = wd - 8 (rewind amount for the 8-wide path)

outer_loop_wd_8:
    subs        r4,r12,#0                   @r4 = wd, columns left in this band
    ble         end_inner_loop_wd_8

inner_loop_wd_8:
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd
    vld1.8      {d0},[r0]!                  @row 0 load,  pu1_src += 8
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.8      {d0},[r1]!                  @row 0 store, pu1_dst += 8
    vld1.8      {d1},[r5],r2                @row 1 load,  pu1_src_tmp += src_strd
    vst1.8      {d1},[r6],r3                @row 1 store, pu1_dst_tmp += dst_strd
    subs        r4,r4,#8                    @columns left -= 8 (sets flags for the bgt below)
    vld1.8      {d2},[r5],r2                @row 2 load,  pu1_src_tmp += src_strd
    vst1.8      {d2},[r6],r3                @row 2 store, pu1_dst_tmp += dst_strd
    vld1.8      {d3},[r5],r2                @row 3 load,  pu1_src_tmp += src_strd
    vst1.8      {d3},[r6],r3                @row 3 store, pu1_dst_tmp += dst_strd
    bgt         inner_loop_wd_8

end_inner_loop_wd_8:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = start of the next band of 4 rows
    sub         r1,r6,r11                   @pu1_dst = start of the next band of 4 rows
    bgt         outer_loop_wd_8

    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (saved lr -> pc)

@ 16-wide path: each inner iteration copies one 16x4 byte tile
core_loop_wd_16:
    sub         r11,r12,#16                 @r11 = wd - 16 (rewind amount for the 16-wide path)

outer_loop_wd_16:
    subs        r4,r12,#0                   @r4 = wd, columns left in this band
    ble         end_inner_loop_wd_16

inner_loop_wd_16:
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd
    vld1.8      {q0},[r0]!                  @row 0 load,  pu1_src += 16
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.8      {q0},[r1]!                  @row 0 store, pu1_dst += 16
    vld1.8      {q1},[r5],r2                @row 1 load,  pu1_src_tmp += src_strd
    vst1.8      {q1},[r6],r3                @row 1 store, pu1_dst_tmp += dst_strd
    subs        r4,r4,#16                   @columns left -= 16 (sets flags for the bgt below)
    vld1.8      {q2},[r5],r2                @row 2 load,  pu1_src_tmp += src_strd
    vst1.8      {q2},[r6],r3                @row 2 store, pu1_dst_tmp += dst_strd
    vld1.8      {q3},[r5],r2                @row 3 load,  pu1_src_tmp += src_strd
    vst1.8      {q3},[r6],r3                @row 3 store, pu1_dst_tmp += dst_strd
    bgt         inner_loop_wd_16

end_inner_loop_wd_16:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = start of the next band of 4 rows
    sub         r1,r6,r11                   @pu1_dst = start of the next band of 4 rows
    bgt         outer_loop_wd_16

    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (saved lr -> pc)

.size ihevc_inter_pred_luma_copy_a9q, .-ihevc_inter_pred_luma_copy_a9q