; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
;--            function
;--
;-- Syntax   : ARM/RVCT armasm. Bilinear chroma interpolation in both the
;--            horizontal and vertical direction, Cb block first, then Cr.
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHorVer register allocation
;//
;// Several physical registers carry two names; the two roles are live in
;// disjoint phases of the function (e.g. r0 is the 'ref' argument until
;// skip_fill, after which it becomes the read pointer 'ptrA').

ref     RN 0            ;// in:  u8* reference frame chroma
ptrA    RN 0            ;// read pointer into ref (aliases ref)

mb      RN 1            ;// write pointer into predPartChroma
block   RN 1            ;// fill-buffer pointer for h264bsdFillBlock (aliases mb)

x0      RN 2            ;// in:  i32 x0
count   RN 2            ;// packed loop counter, see layout below (aliases x0)

y0      RN 3            ;// in:  i32 y0
valY    RN 3            ;// packed |yFrac|8-yFrac| SMUAD operand (aliases y0)

width   RN 4            ;// picture width (or fill-block width after do_fill)

tmp4    RN 5
height  RN 5            ;// picture height (aliases tmp4)

tmp1    RN 6

tmp2    RN 7

tmp3    RN 8

valX    RN 9            ;// 8 - xFrac

tmp5    RN 10
chrPW   RN 10           ;// chromaPartWidth (aliases tmp5)

tmp6    RN 11
chrPH   RN 11           ;// chromaPartHeight (aliases tmp6)

xFrac   RN 12           ;// horizontal fraction [0,7]

c32     RN 14           ;// constant 32 = rounding term for the >>6 scaling
yFrac   RN 14           ;// vertical fraction [0,7] (aliases c32)

;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHorVer

;// Function arguments (C prototype order); offsets are from sp after the
;// prologue (13 registers pushed = 0x34 bytes, plus 0xc4 bytes of locals,
;// so the caller's stack arguments start at sp + 0xf8):
;//
;// u8 *ref,                 : 0xc4  (saved r0)
;// u8 *predPartChroma,      : 0xc8  (saved r1)
;// i32 x0,                  : 0xcc  (saved r2)
;// i32 y0,                  : 0xd0  (saved r3)
;// u32 width,               : 0xf8
;// u32 height,              : 0xfc
;// u32 xFrac,               : 0x100
;// u32 yFrac,               : 0x104
;// u32 chromaPartWidth,     : 0x108
;// u32 chromaPartHeight     : 0x10c
;//
;// Local frame layout:
;//   sp + 0x00..0x13 : outgoing stack arguments for h264bsdFillBlock
;//   sp + 0x1c..     : temporary fill buffer (Cb block followed by Cr block;
;//                     presumably sized for two 9x9 blocks — the largest
;//                     chroma partition plus the one-pixel interpolation
;//                     border)

h264bsdInterpolateChromaHorVer
    STMFD   sp!, {r0-r11,lr}        ;// save args r0-r3 too, they are reloaded
                                    ;// from the stack after BL clobbers them
    SUB     sp, sp, #0xc4

    LDR     chrPW, [sp, #0x108]     ;// chromaPartWidth
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    LDR     width, [sp, #0xf8]      ;// width

    ;// If the (chromaPartWidth+1)x(chromaPartHeight+1) source area does not
    ;// lie fully inside the picture, first copy it (with edge pixels
    ;// replicated by h264bsdFillBlock) into the on-stack buffer and
    ;// interpolate from there instead of from the frame.
    CMP     x0, #0
    BLT     do_fill                 ;// x0 < 0

    ADD     tmp1, x0, chrPW         ;// tmp1 = x0 + chromaPartWidth
    ADD     tmp1, tmp1, #1          ;// tmp1 = x0 + chromaPartWidth + 1
    CMP     tmp1, width             ;// x0 + chromaPartWidth + 1 > width ?
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill                 ;// y0 < 0
    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp1, y0, chrPH         ;// tmp1 = y0 + chromaPartHeight
    ADD     tmp1, tmp1, #1          ;// tmp1 = y0 + chromaPartHeight + 1
    CMP     tmp1, height
    BLS     skip_fill               ;// fully inside -> read frame directly

do_fill
    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp3, chrPW, #1         ;// tmp3 = chromaPartWidth + 1 (scan length)
    ADD     tmp1, chrPW, #1         ;// tmp1 = chromaPartWidth + 1 (block width)
    ADD     tmp2, chrPH, #1         ;// tmp2 = chromaPartHeight + 1 (block height)
    ;// stack args for h264bsdFillBlock: width, height, blockWidth,
    ;// blockHeight, fillScanLength; register args ref/block/x0/y0 are
    ;// already in r0-r3
    STMIA   sp, {width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c        ;// block = on-stack fill buffer
    BL      h264bsdFillBlock        ;// fill the Cb source block

    ;// r0-r3 were clobbered by the call; reload and fill the Cr block
    ;// (tmp1/tmp2/tmp3 survive in callee-saved r6-r8)
    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp, {width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width*height (Cr plane follows
                                    ;// the Cb plane in the reference buffer)
    MLA     block, tmp2, tmp1, block;// block += (chrPW+1)*(chrPH+1), i.e.
                                    ;// past the filled Cb block
    BL      h264bsdFillBlock

    ;// Redirect the interpolation source to the filled buffer: the saved
    ;// arguments are rewritten so the code after skip_fill needs no
    ;// special case.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     tmp2, [sp, #0xfc]       ;// height = chromaPartHeight + 1
    STR     tmp1, [sp, #0xf8]       ;// width  = chromaPartWidth + 1
    MOV     width, tmp1

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width + x0
    LDR     yFrac, [sp, #0x104]     ;// yFrac
    LDR     xFrac, [sp, #0x100]
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width + x0
    RSB     valX, xFrac, #8         ;// valX = 8 - xFrac
    RSB     valY, yFrac, #8         ;// valY = 8 - yFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing

    SUB     tmp2, chrPH, #1         ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1         ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x reload value
    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY| packed for SMUAD
    MOV     c32, #32                    ;// rounding constant for >>6


;///////////////////////////////////////////////////////////////////////////
;// Cb
;///////////////////////////////////////////////////////////////////////////

;// 2x2 pels per iteration
;// bilinear vertical and horizontal interpolation:
;// each column is first filtered vertically with SMUAD on the packed
;// |lower|upper| pair, then the two column results are combined
;// horizontally with valX/xFrac; the row written at [mb] and the row at
;// [mb,#8] (output stride 8) are produced together.

loop1_y
    ;// prime the left column: three vertically adjacent source pels
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1 = (t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3 = (t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// reload loop_x into [31:28]
loop1_x
    ;// first output column: new right column in tmp2/tmp4, previous
    ;// column's vertical results still in tmp1/tmp3
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2 = (t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5 = t1*valX + 32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5 = t2*xFrac + t5

    SMUAD   tmp4, tmp4, valY            ;// t4 = (t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6 = t3*valX + 32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6 = t4*xFrac + t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down (divide by 64)
    STRB    tmp6, [mb, #8]              ;// store pixel, second output row
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel, first output row

    ;// second output column: tmp2/tmp4 are now the LEFT column, so the
    ;// valX/xFrac roles are swapped relative to the block above
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1 = (t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5 = t1*xFrac + 32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5 = t2*valX + t5

    SMUAD   tmp3, tmp3, valY            ;// t3 = (t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6 = t3*xFrac + 32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6 = t4*valX + t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    SUBS    count, count, #2<<28        ;// loop_x -= 2; C clear on borrow
    BCS     loop1_x

    AND     tmp2, count, #0x00F00000    ;// loop_x reload value

    ;// advance mb to the next output row pair: mb += 16 - chromaPartWidth.
    ;// ADDS leaves C clear (no 32-bit overflow on a pointer), so each SBC
    ;// subtracts (chromaPartWidth-1) + 1
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1   ;// ptrA += 2*width - chromaPartWidth
    SBC     ptrA, ptrA, tmp2, LSR #20

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (0xE = -2 in the
                                        ;// 4-bit field, with the x-loop
                                        ;// borrow folded in)
    BGE     loop1_y                     ;// continue while counter >= 0

;///////////////////////////////////////////////////////////////////////////
;// Cr
;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// height + y0: skip the Cb plane
    MLA     tmp3, tmp1, width, tmp2     ;// (height+y0)*width + x0
    ADD     ptrA, ref, tmp3             ;// ptrA -> Cr source block
    ADD     mb, mb, #64                 ;// Cr output follows the 8x8 Cb block

    ;// rebuild the packed counter: clear stale loop_x in [31:28] and
    ;// reload loop_y from the saved chromaPartHeight-1 field
    AND     count, count, #0x00FFFFFF
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y
    AND     tmp2, count, #0x00F00000    ;// loop_x reload value

;// 2x2 pels per iteration
;// bilinear vertical and horizontal interpolation
;// (identical scheme to the Cb loop above)
loop2_y
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1 = (t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3 = (t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// reload loop_x
loop2_x
    ;// first
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2 = (t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5 = t1*valX + 32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5 = t2*xFrac + t5

    SMUAD   tmp4, tmp4, valY            ;// t4 = (t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6 = t3*valX + 32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6 = t4*xFrac + t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    ;// second (valX/xFrac roles swapped, as in loop1_x)
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1 = (t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5 = t1*xFrac + 32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5 = t2*valX + t5

    SMUAD   tmp3, tmp3, valY            ;// t3 = (t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6 = t3*xFrac + 32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6 = t4*valX + t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000

    ADDS    mb, mb, #16                 ;// mb += 16 - chromaPartWidth
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1   ;// ptrA += 2*width - chromaPartWidth
    SBC     ptrA, ptrA, tmp2, LSR #20

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2
    BGE     loop2_y

    ;// epilogue: drop locals (0xc4) plus the saved r0-r3 (0x10) in one
    ;// adjustment, then restore r4-r11 and return via pc
    ADD     sp, sp, #0xd4
    LDMFD   sp!, {r4-r11,pc}

    END