///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevcd_itrans_recon_dc_chroma.s
//*
//* //brief
//*  contains function definitions itrans and recon for dc only case
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*  - ihevcd_itrans_recon_dc_chroma_av8
//*
//* //remarks
//*  none
//*
//*******************************************************************************/


.text
.include "ihevc_neon_macros.s"


.globl ihevcd_itrans_recon_dc_chroma_av8

.type ihevcd_itrans_recon_dc_chroma_av8, %function

//------------------------------------------------------------------------------
// void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
//                                    uword8 *pu1_dst,
//                                    word32  pred_strd,
//                                    word32  dst_strd,
//                                    word32  log2_trans_size,
//                                    word16  i2_coeff_value)
//
// DC-only inverse transform + reconstruction.
//
// A single dc value is derived from i2_coeff_value:
//     t  = CLIP_S16((i2_coeff_value * 64 +   64) >>  7)
//     dc = CLIP_S16((t              * 64 + 2048) >> 12)
// and added, with saturation to 0..255, to every even-indexed byte (counted
// from pu1_pred) of a block that is trans_size rows by trans_size byte pairs.
// Odd-indexed bytes are copied from pred to dst unchanged.
// NOTE(review): presumably the buffers hold interleaved Cb/Cr samples and the
// caller selects the plane through the pointer offset - confirm with callers.
//
// trans_size == 4 takes a dedicated path; every other size is processed in
// tiles of 8 rows x 8 byte pairs, so it is expected to be a multiple of 8.
//
// In:    x0 = pu1_pred
//        x1 = pu1_dst
//        w2 = pred_strd        (bytes, signed 32 bit)
//        w3 = dst_strd         (bytes, signed 32 bit)
//        w4 = log2_trans_size
//        w5 = i2_coeff_value   (signed 16 bit)
// Out:   none
// Clobb: x0-x13, v0-v7, v16-v31, flags
//        v8-v15 are used as well and are bracketed by push_v_regs/pop_v_regs
//        (presumably defined in ihevc_neon_macros.s to save/restore them -
//        confirm against that file).
//------------------------------------------------------------------------------

ihevcd_itrans_recon_dc_chroma_av8:

    push_v_regs

    // AAPCS64 leaves the unused upper bits of narrow integer arguments
    // unspecified, so widen everything that is used as a 64-bit value.
    sxtw        x2, w2                      // pred_strd
    sxtw        x3, w3                      // dst_strd
    sxth        x5, w5                      // i2_coeff_value

    mov         x10, #1
    lsl         x4, x10, x4                 // trans_size = (1 << log2_trans_size)
                                            // (register shift only uses the low 6 bits of x4)

    mov         x12, #32767                 // upper clip bound
    mov         x13, #-32768                // lower clip bound

    // stage 1: t = CLIP_S16((coeff * 64 + 64) >> 7)
    mov         x6, #64                     // 1 << (shift1 - 1)
    add         x8, x6, x5, lsl #6
    asr         x8, x8, #7
    cmp         x8, x12
    csel        x8, x12, x8, gt
    cmp         x8, x13
    csel        x8, x13, x8, lt

    // stage 2: dc = CLIP_S16((t * 64 + 2048) >> 12)
    mov         x7, #2048                   // 1 << (shift2 - 1)
    add         x6, x7, x8, lsl #6
    asr         x6, x6, #12
    cmp         x6, x12
    csel        x6, x12, x6, gt
    cmp         x6, x13
    csel        x6, x13, x6, lt

    // x6 has the dc_value
    // x4 has the trans_size value
    // x8 has the row value
    // x9 has the col value
    mov         x8, x4
    dup         v0.8h, w6                   // dc in all eight 16-bit lanes
    cmp         x4, #4
    beq         row_loop_4chroma


row_loop_chroma:
    mov         x9, x4                      // restart the column count for this band of 8 rows

col_loop_chroma:

    // Load an 8-row tile; ld2 splits each 16-byte row into even bytes
    // (first register) and odd bytes (second register).
    mov         x7, x0
    ld2         {v2.8b, v3.8b}, [x7], x2
    ld2         {v4.8b, v5.8b}, [x7], x2
    ld2         {v6.8b, v7.8b}, [x7], x2
    ld2         {v8.8b, v9.8b}, [x7], x2

    ld2         {v10.8b, v11.8b}, [x7], x2
    ld2         {v12.8b, v13.8b}, [x7], x2
    ld2         {v14.8b, v15.8b}, [x7], x2
    ld2         {v16.8b, v17.8b}, [x7]

    add         x0, x0, #16                 // next tile in this band

    // dc + even bytes, widened to 16 bit
    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v4.8b
    uaddw       v26.8h, v0.8h, v6.8b
    uaddw       v24.8h, v0.8h, v8.8b
    uaddw       v22.8h, v0.8h, v10.8b
    uaddw       v20.8h, v0.8h, v12.8b
    uaddw       v18.8h, v0.8h, v14.8b
    uaddw       v31.8h, v0.8h, v16.8b

    // saturate back to unsigned 8 bit; the odd-byte registers are untouched
    sqxtun      v2.8b, v30.8h
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v8.8b, v24.8h
    sqxtun      v10.8b, v22.8h
    sqxtun      v12.8b, v20.8h
    sqxtun      v14.8b, v18.8h
    sqxtun      v16.8b, v31.8h

    // st2 re-interleaves the updated even bytes with the original odd bytes
    mov         x11, x1
    st2         {v2.8b, v3.8b}, [x11], x3
    st2         {v4.8b, v5.8b}, [x11], x3
    st2         {v6.8b, v7.8b}, [x11], x3
    st2         {v8.8b, v9.8b}, [x11], x3

    st2         {v10.8b, v11.8b}, [x11], x3
    st2         {v12.8b, v13.8b}, [x11], x3
    st2         {v14.8b, v15.8b}, [x11], x3
    st2         {v16.8b, v17.8b}, [x11]

    add         x1, x1, #16

    subs        x9, x9, #8
    bgt         col_loop_chroma

    subs        x8, x8, #8                  // flags are consumed by the bgt below;
                                            // the add/sub in between do not set flags

    // advance 8 rows and rewind the 2 * trans_size bytes walked by the column loop
    add         x0, x0, x2, lsl #3
    add         x1, x1, x3, lsl #3
    sub         x0, x0, x4, lsl #1
    sub         x1, x1, x4, lsl #1
    bgt         row_loop_chroma
    b           end_loops_chroma


row_loop_4chroma:

    // 4x4 case: a row is 4 byte pairs = 8 bytes. Load exactly those 8 bytes
    // per row and add a vector that carries dc in the even 16-bit lanes and
    // 0 in the odd lanes, so odd bytes pass through unchanged.
    uxth        w7, w6
    dup         v1.4s, w7                   // as .8h: {dc, 0, dc, 0, dc, 0, dc, 0}

    // all rows are loaded before any store, so pu1_pred may alias pu1_dst
    ld1         {v2.8b}, [x0], x2
    ld1         {v4.8b}, [x0], x2
    ld1         {v6.8b}, [x0], x2
    ld1         {v16.8b}, [x0]

    uaddw       v30.8h, v1.8h, v2.8b
    uaddw       v28.8h, v1.8h, v4.8b
    uaddw       v26.8h, v1.8h, v6.8b
    uaddw       v24.8h, v1.8h, v16.8b

    sqxtun      v2.8b, v30.8h
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v16.8b, v24.8h

    st1         {v2.8b}, [x1], x3
    st1         {v4.8b}, [x1], x3
    st1         {v6.8b}, [x1], x3
    st1         {v16.8b}, [x1]

end_loops_chroma:
    pop_v_regs
    ret

