; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHor register allocation
;// (two aliases per register document the two disjoint live ranges)

ref     RN 0    ;// in: reference frame (Cb plane first, Cr follows)
ptrA    RN 0    ;// loops: source read pointer

mb      RN 1    ;// predPartChroma output pointer (8-byte row stride)
block   RN 1    ;// fill path: on-stack edge-extension buffer

x0      RN 2    ;// in: horizontal source position
count   RN 2    ;// loops: packed loop counters (layout documented below)

y0      RN 3    ;// in: vertical source position
valX    RN 3    ;// loops: packed weights |xFrac*8|(8-xFrac)*8|

width   RN 4    ;// frame width (chromaPartWidth+1 after edge fill)

height  RN 5    ;// frame height (chromaPartHeight after edge fill)
tmp7    RN 5

chrPW   RN 6    ;// chromaPartWidth
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7    ;// chromaPartHeight

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14   ;// rounding constant 32
xFrac   RN 14   ;// horizontal fraction, 0..7

;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHor

;// Function arguments (offsets from sp after the prologue below)
;//
;// u8 *ref,                : 0xc4  (saved r0)
;// u8 *predPartChroma,     : 0xc8  (saved r1)
;// i32 x0,                 : 0xcc  (saved r2)
;// i32 y0,                 : 0xd0  (saved r3)
;// u32 width,              : 0xf8  (caller stack)
;// u32 height,             : 0xfc
;// u32 xFrac,              : 0x100
;// u32 chromaPartWidth,    : 0x104
;// u32 chromaPartHeight    : 0x108
;//
;// Local frame: sp+0x00..0x13 hold the five stack arguments passed to
;// h264bsdFillBlock; sp+0x1c is the edge-fill buffer for both chroma
;// components (2 * (chromaPartWidth+1) * chromaPartHeight bytes max).
;//
;// Each output pel is the horizontal bilinear value
;//     ((8-xFrac)*8*A + xFrac*8*B + 32) >> 6
;// where A,B are horizontally adjacent source pels (yFrac == 0, so the
;// vertical weight 8 is folded into the packed multipliers).

h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}        ;// save args r0-r3 and callee-saved regs
    SUB     sp, sp, #0xc4           ;// call-out args + edge-fill buffer

    LDR     chrPW, [sp, #0x104]     ;// chromaPartWidth
    LDR     width, [sp, #0xf8]      ;// width

    ;// If the (chromaPartWidth+1) x chromaPartHeight source area is not
    ;// fully inside the reference frame, build an edge-extended copy on
    ;// the stack with h264bsdFillBlock and interpolate from that instead.
    CMP     x0, #0
    BLT     do_fill                 ;// x0 < 0

    ADD     tmp6, x0, chrPW         ;// tmp6 = x0 + chromaPartWidth
    ADD     tmp6, tmp6, #1          ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width             ;// x0 + chromaPartWidth + 1 > width?
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill                 ;// y0 < 0
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp6, y0, chrPH         ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill               ;// fully inside -> no fill needed

do_fill
    ;// One fill call per component: (chromaPartWidth+1) x chromaPartHeight
    ;// pels each (one extra column for the horizontal filter tap).
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp8, chrPW, #1         ;// tmp8 = chromaPartWidth+1
    MOV     tmp2, tmp8              ;// fill scan length = chromaPartWidth+1
    STMIA   sp, {width,height,tmp8,chrPH,tmp2}  ;// stack args for FillBlock
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock        ;// fill Cb

    LDR     x0, [sp, #0xcc]         ;// reload r0-r3 clobbered by the call
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp, {width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width*height -> Cr plane
    MLA     block, chrPH, tmp8, block ;// block += chrPH*(chrPW+1) -> Cr fill
    BL      h264bsdFillBlock        ;// fill Cr

    ;// Redirect the interpolation source to the filled buffer.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     chrPH, [sp, #0xfc]      ;// height = chromaPartHeight
    STR     tmp8, [sp, #0xf8]       ;// width  = chromaPartWidth+1
    MOV     width, tmp8
    ;// NOTE(fix): chrPW must NOT be decremented here.  Both paths must
    ;// reach skip_fill with chrPW == chromaPartWidth: the packed loop
    ;// counter and the ADDS/SBC stride arithmetic below assume the
    ;// [23:20] field is exactly chromaPartWidth-1; a decremented chrPW
    ;// would shift the output by one pel per row pair on edge blocks.

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8         ;// valX = 8-xFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing (leftover x0 value, never examined)

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valX, valX, xFrac, LSL #16  ;// |xFrac|valX|
    MOV     valX, valX, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding constant


;///////////////////////////////////////////////////////////////////////////
;// Cb
;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// re-arm loop_x in [31:28]
    LDRB    tmp1, [ptrA, width]         ;// row1 col0
    LDRB    tmp2, [ptrA], #1            ;// row0 col0

loop1_x
    LDRB    tmp3, [ptrA, width]         ;// row1 col1
    LDRB    tmp4, [ptrA], #1            ;// row0 col1

    PKHBT   tmp5, tmp1, tmp3, LSL #16   ;// |row1 col1|row1 col0|
    PKHBT   tmp6, tmp2, tmp4, LSL #16   ;// |row0 col1|row0 col0|

    LDRB    tmp1, [ptrA, width]         ;// row1 col2
    LDRB    tmp2, [ptrA], #1            ;// row0 col2

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply, accumulate rounding 32
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16   ;// |row1 col2|row1 col1|
    PKHBT   tmp8, tmp4, tmp2, LSL #16   ;// |row0 col2|row0 col1|

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns/iter)
    BCS     loop1_x

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1

    ;// ADDS leaves C clear (no pointer wrap), so each SBC subtracts
    ;// (chromaPartWidth-1)+1: mb advances 16-chrPW, ptrA 2*width total,
    ;// i.e. both step to the next pair of rows.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (two rows/iter)
    BGE     loop1_y

;///////////////////////////////////////////////////////////////////////////
;// Cr
;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp,#0xfc]      ;// height (chrPH after fill)
    LDR     ref, [sp, #0xc4]        ;// ref
    LDR     tmp1, [sp, #0xd0]       ;// y0
    LDR     tmp2, [sp, #0xcc]       ;// x0
    LDR     mb, [sp, #0xc8]         ;// predPartChroma
    
    ADD     tmp1, height, tmp1      ;// Cr plane starts height rows below Cb
    MLA     tmp3, tmp1, width, tmp2 ;// (y0+height)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64             ;// Cr output half of predPartChroma (8*8)

    AND     count, count, #0x00FFFFFF   ;// clear exhausted loop_x/loop_y
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// re-arm loop_y
    AND     tmp2, count, #0x00F00000    ;// loop_x

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation
loop2_y
    ADD     count, count, tmp2, LSL #8  ;// re-arm loop_x in [31:28]
    LDRB    tmp1, [ptrA, width]         ;// row1 col0
    LDRB    tmp2, [ptrA], #1            ;// row0 col0

loop2_x
    LDRB    tmp3, [ptrA, width]         ;// row1 col1
    LDRB    tmp4, [ptrA], #1            ;// row0 col1

    PKHBT   tmp5, tmp1, tmp3, LSL #16   ;// |row1 col1|row1 col0|
    PKHBT   tmp6, tmp2, tmp4, LSL #16   ;// |row0 col1|row0 col0|

    LDRB    tmp1, [ptrA, width]         ;// row1 col2
    LDRB    tmp2, [ptrA], #1            ;// row0 col2

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply, accumulate rounding 32
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16   ;// |row1 col2|row1 col1|
    PKHBT   tmp8, tmp4, tmp2, LSL #16   ;// |row0 col2|row0 col1|

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns/iter)
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1

    ;// Same ADDS/SBC trick as in the Cb loop: step to the next row pair.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (two rows/iter)
    BGE     loop2_y

    ADD     sp, sp, #0xd4           ;// locals (0xc4) + saved r0-r3 (0x10)
    LDMFD   sp!, {r4-r11,pc}

    END