; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA |.text|, CODE

;// h264bsdInterpolateHorHalf register allocation
;//
;// Several registers are double-aliased: the first name is live before the
;// filter loops, the second name is live inside them.

ref     RN 0    ;// in: pointer to reference frame / fill buffer

mb      RN 1    ;// in: pointer to output macroblock
buff    RN 1    ;// scratch: fill-buffer pointer passed to h264bsdFillBlock

count   RN 2    ;// packed loop counters + width (layout documented below)
x0      RN 2    ;// in: x coordinate of the partition in the reference frame

y0      RN 3    ;// in: y coordinate of the partition in the reference frame
x_2_0   RN 3    ;// packed 16-bit pixels: pels 2 and 0 of the filter window

width   RN 4    ;// in (stack): reference frame width in pixels
x_3_1   RN 4    ;// packed 16-bit pixels: pels 3 and 1

height  RN 5    ;// in (stack): reference frame height in pixels
x_6_4   RN 5    ;// packed 16-bit pixels: pels 6 and 4

partW   RN 6    ;// in (stack): partition width
x_7_5   RN 6    ;// packed 16-bit pixels: pels 7 and 5

partH   RN 7    ;// in (stack): partition height
tmp1    RN 7    ;// filter accumulator (output pixel 0)

tmp2    RN 8    ;// filter accumulator (output pixel 2)

tmp3    RN 9    ;// filter accumulator (output pixel 1)

tmp4    RN 10   ;// filter accumulator (output pixel 3)

mult_20_01  RN 11   ;// packed halfword constants {20, 1}  = 0x00140001
mult_20_m5  RN 12   ;// packed halfword constants {20, -5} = 0x0014FFFB

plus16  RN 14   ;// rounding constant: result = (sum + 16) >> 5


;// function exports and imports

    IMPORT h264bsdFillBlock

    EXPORT h264bsdInterpolateHorHalf

;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [ 2  0 ]  [ 3  1 ]  [ 6  4 ]  [ 7  5 ]  [ 10  8 ] ...
;// y_0 =   20 1      20 -5     -5        1
;// y_1 =   -5        20 1      1  20     -5
;// y_2 =   1         -5        -5 20     1  20
;// y_3 =   1         20 -5     -5 20     1
;//
;// i.e. each output pixel y_n is the 6-tap half-pel FIR
;// [1 -5 20 20 -5 1] applied to six consecutive reference pixels,
;// and the taps are distributed over the packed registers above so
;// that four outputs are produced per loop_x iteration.

;-------------------------------------------------------------------------------
;-- h264bsdInterpolateHorHalf
;--
;-- Interpolates a horizontal half-pixel position of a prediction partition
;-- using the 6-tap filter [1 -5 20 20 -5 1]; each output is clipped to
;-- [0,255] as (sum + 16) >> 5 (see the USAT16 #8 below).
;--
;-- C-equivalent signature, inferred from the register allocation above and
;-- the stack-argument offsets used below — confirm against the C prototype:
;--   void h264bsdInterpolateHorHalf(u8 *ref, u8 *mb, i32 x0, i32 y0,
;--                                  u32 width, u32 height,
;--                                  u32 partWidth, u32 partHeight);
;--
;-- Stack frame after the prologue (STMFD of 13 regs = 0x34 bytes, then
;-- SUB sp, #0x1e4):
;--   [sp, #0x000..0x010]  outgoing stack args for h264bsdFillBlock
;--   [sp, #0x028]         21x21 fill buffer (used when the partition
;--                        overlaps the frame boundary)
;--   [sp, #0x1e4..0x1f0]  saved r0-r3 = ref, mb, x0, y0
;--   [sp, #0x218..0x224]  caller's stack args = width, height,
;--                        partWidth, partHeight
;-------------------------------------------------------------------------------

h264bsdInterpolateHorHalf
    STMFD   sp!, {r0-r11, lr}       ;// save args r0-r3 + callee-saved regs
    SUB     sp, sp, #0x1e4          ;// local frame (fill buffer + outgoing args)

    ;// Decide whether the (partWidth+5) x partHeight source window fits
    ;// inside the reference frame; if not, fall through to do_fill which
    ;// builds an edge-padded copy of the window on the stack.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, x0, partW         ;// (x0 + partWidth)
    ADD     tmp4, tmp4, #5          ;// x0 + partWidth + 5 (filter margin)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp4, width
    BHI     do_fill                 ;// (x0 + partW + 5) > width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp2, y0, partH         ;// (y0 + partHeight)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp2, height
    BLS     skip_fill               ;// window inside frame; no fill needed


do_fill
    ;// Call h264bsdFillBlock(ref, buff, x0, y0, width, height,
    ;//                       partW+5, partH, partW+5) to produce an
    ;// edge-replicated copy of the source window, then filter from that
    ;// copy instead of the frame. (Argument meaning per stack layout
    ;// below — confirm against the h264bsdFillBlock prototype.)
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5
    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    ;// Redirect the filter to the stack copy: origin (0,0), stride partW+5.
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1 (stack fill buffer)
    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5


skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width (frame or fill-buffer stride)
    MLA     tmp2, width, y0, x0     ;// y0*width + x0
    ADD     ref, ref, tmp2          ;// ref += y0*width + x0
    ADD     ref, ref, #8            ;// bias so the loop can read [ref,#-8]
    LDR     mb, [sp, #0x1e8]        ;// mb (output pointer)

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1
    SUB     partH, partH, #1
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16

    LDR     mult_20_01, =0x00140001 ;// packed taps {20, 1}
    LDR     mult_20_m5, =0x0014FFFB ;// packed taps {20, -5}
    MOV     plus16, #16             ;// rounding offset for the >>5
    AND     tmp1, count, #0x000F0000 ;// partHeight-1
    AND     tmp3, count, #0x00F00000 ;// partWidth-1
    ADD     count, count, tmp1, LSL #8 ;// init loop_y counter in [27:24]
loop_y
    LDR     x_3_1, [ref, #-8]       ;// pels 3..0 (starts 2 left of output)
    ADD     count, count, tmp3, LSL #8 ;// (re)init loop_x counter in [31:28]
    LDR     x_7_5, [ref, #-4]       ;// pels 7..4
    UXTB16  x_2_0, x_3_1            ;// unpack even bytes: {pel2, pel0}
    UXTB16  x_3_1, x_3_1, ROR #8    ;// unpack odd bytes:  {pel3, pel1}
    UXTB16  x_6_4, x_7_5            ;// {pel6, pel4}

    ;// Each loop_x pass computes 8 output pixels (two interleaved 4-pixel
    ;// groups); the accumulator/tap pairing follows the coefficient table
    ;// in the file header. plus16 pre-loads the rounding term.
loop_x
    UXTB16  x_7_5, x_7_5, ROR #8    ;// {pel7, pel5}

    ;// Outputs 0..3: start each accumulator with its first tap pair + 16.
    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4        ;// prefetch next 4 pels (10..8 group)
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1            ;// unpack even bytes of the new word
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// (sum+16)>>5, repack two results per register, clip to [0,255].
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    SUBS    count, count, #4<<28    ;// loop_x -= 4; borrow clears C
    ORR     tmp1, tmp1, tmp2, LSL #8 ;// interleave to 4 output bytes
    STR     tmp1, [mb], #4
    BCC     next_y                  ;// loop_x went negative: row done

    ;// Second 4-pixel group: identical filter with the register roles
    ;// rotated by one word (x_6_4/x_7_5 now hold the leading pels).
    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4        ;// prefetch next 4 pels
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5            ;// unpack even bytes of the new word
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    SUBS    count, count, #4<<28    ;// loop_x -= 4
    ORR     tmp1, tmp1, tmp2, LSL #8
    STR     tmp1, [mb], #4
    BCS     loop_x                  ;// more pixels left on this row

next_y
    ;// Advance to the next row: ref moves down one stride and back to the
    ;// row start; mb rows are 16 bytes wide.
    AND     tmp3, count, #0x00F00000 ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref ;// ref += width (count[15:0] * 1)
    ADDS    mb, mb, #16             ;// +16; also forces Carry=0 for the SBCs
    SBC     mb, mb, tmp3, LSR #20   ;// mb  -= (partWidth-1)+1 (C==0 adds the -1)
    SBC     ref, ref, tmp3, LSR #20 ;// ref -= (partWidth-1)+1
    ADDS    count, count, #(1<<28)-(1<<24) ;// reset loop_x sign, loop_y -= 1
    BGE     loop_y                  ;// loop while loop_y counter >= 0

    ADD     sp,sp,#0x1f4            ;// drop frame + saved r0-r3 (0x1e4+0x10)
    LDMFD   sp!, {r4-r11, pc}       ;// restore callee-saved regs and return

    END