///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_inter_pred_chroma_copy.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*  Functions are coded in AArch64 NEON assembly.
//*
//* @author
//*  Yogeswaran RS
//*
//* @par List of Functions:
//*  - ihevc_inter_pred_chroma_copy_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  Chroma inter prediction filter for copy
//*
//* @par Description:
//*  Copies the array of width 'wd' and height 'ht' from the location pointed
//*  to by 'src' to the location pointed to by 'dst'
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  WORD8 pointer to the filter coefficients (unused on the copy path)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy(UWORD8 *pu1_src,
//                                  UWORD8 *pu1_dst,
//                                  WORD32 src_strd,
//                                  WORD32 dst_strd,
//                                  WORD8 *pi1_coeff,
//                                  WORD32 ht,
//                                  WORD32 wd)
//**************Variables Vs Registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff
//x5 => ht
//x6 => wd
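
//**************Reference behaviour (illustrative only)************************
//The code below is a plain 2-D byte copy. As a rough scalar sketch, assuming
//the usual libhevc typedefs (UWORD8 = unsigned 8-bit, WORD32 = signed 32-bit)
//and noting that pi1_coeff is not used on the copy path, the intended
//behaviour is:
//
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < 2 * wd; col++)
//            pu1_dst[col] = pu1_src[col];
//        pu1_src += src_strd;
//        pu1_dst += dst_strd;
//    }
//
//Each row of 2 * wd bytes (interleaved Cb/Cr samples) is copied, and the
//pointers then advance by their respective strides.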
.text
.align 4

.globl ihevc_inter_pred_chroma_copy_av8

.type ihevc_inter_pred_chroma_copy_av8, %function

ihevc_inter_pred_chroma_copy_av8:

    LSL         x12,x6,#1                   //x12 = wd << 1 (bytes per row)
    CMP         x5,#0                       //checks ht == 0
    BLE         END_LOOPS
    AND         x8,x5,#3                    //x8 = ht % 4 (2-row tail if non-zero)
    SUB         x5,x5,x8                    //ht rounded down to a multiple of 4
    TST         x12,#15                     //checks if (wd << 1) is a multiple of 16
    BEQ         CORE_LOOP_WD_16
    TST         x12,#7                      //checks if (wd << 1) is a multiple of 8
    BEQ         CORE_LOOP_WD_8
    SUB         x11,x12,#4
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_4_HT_2

OUTER_LOOP_WD_4:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_INNER_LOOP_WD_4

INNER_LOOP_WD_4:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //wd - 4 (loop condition)
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x1,x1,#4                    //pu1_dst += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    BGT         INNER_LOOP_WD_4

END_INNER_LOOP_WD_4:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_4
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_4_HT_2

END_LOOPS:
    RET

OUTER_LOOP_WD_4_HT_2:
    SUBS        x4,x12,#0                   //checks wd == 0
    BLE         END_LOOPS

INNER_LOOP_WD_4_HT_2:
    LD1         {v0.s}[0],[x0]              //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.s}[0],[x1]              //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    LD1         {v0.s}[0],[x7],x2           //vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    ADD         x0,x0,#4                    //pu1_src += 4
    ST1         {v0.s}[0],[x6],x3           //vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    SUBS        x4,x4,#4                    //wd - 4 (loop condition)
    ADD         x1,x1,#4                    //pu1_dst += 4
    BGT         INNER_LOOP_WD_4_HT_2
    B           END_LOOPS

CORE_LOOP_WD_8:
    SUB         x11,x12,#8
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_8_HT_2

OUTER_LOOP_WD_8:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_INNER_LOOP_WD_8

INNER_LOOP_WD_8:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#8                    //wd - 8 (loop condition)
    LD1         {v2.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v2.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v3.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_8

END_INNER_LOOP_WD_8:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_8
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_8_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_8_HT_2:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_LOOPS

INNER_LOOP_WD_8_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.8b},[x0],#8             //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.8b},[x1],#8             //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.8b},[x7],x2             //vld1_u8(pu1_src_tmp)
    ST1         {v1.8b},[x6],x3             //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#8                    //wd - 8 (loop condition)
    BGT         INNER_LOOP_WD_8_HT_2
    B           END_LOOPS
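
//Path taken when (wd << 1) is a multiple of 16: each inner-loop iteration
//copies one 16-byte column block for four consecutive rows (v0-v3), and the
//outer loop then steps pu1_src/pu1_dst down by four rows using
//x11 = (wd << 1) - 16. Any remaining 2-row tail (ht % 4 != 0) is handled by
//OUTER_LOOP_WD_16_HT_2.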
CORE_LOOP_WD_16:
    SUB         x11,x12,#16
    CMP         x5,#0
    BEQ         OUTER_LOOP_WD_16_HT_2

OUTER_LOOP_WD_16:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_INNER_LOOP_WD_16

INNER_LOOP_WD_16:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#16                   //wd - 16 (loop condition)
    LD1         {v2.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v2.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v3.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v3.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    BGT         INNER_LOOP_WD_16

END_INNER_LOOP_WD_16:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = pu1_src_tmp
    SUB         x1,x6,x11                   //pu1_dst = pu1_dst_tmp
    BGT         OUTER_LOOP_WD_16
    CMP         x8,#0
    BGT         OUTER_LOOP_WD_16_HT_2
    B           END_LOOPS

OUTER_LOOP_WD_16_HT_2:
    SUBS        x4,x12,#0                   //checks wd
    BLE         END_LOOPS

INNER_LOOP_WD_16_HT_2:
    ADD         x7,x0,x2                    //pu1_src_tmp += src_strd
    LD1         {v0.16b},[x0],#16           //vld1_u8(pu1_src_tmp)
    ADD         x6,x1,x3                    //pu1_dst_tmp += dst_strd
    ST1         {v0.16b},[x1],#16           //vst1_u8(pu1_dst_tmp, tmp_src)
    LD1         {v1.16b},[x7],x2            //vld1_u8(pu1_src_tmp)
    ST1         {v1.16b},[x6],x3            //vst1_u8(pu1_dst_tmp, tmp_src)
    SUBS        x4,x4,#16                   //wd - 16 (loop condition)
    BGT         INNER_LOOP_WD_16_HT_2

    RET