;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;------------------------------------------------------------------------------
; 4x4 inverse-transform routines, ARM state, ARMv6 media instructions
; (smulwb/smulwt, pkhbt/pkhtb, uadd16/usub16, sxth).  armasm syntax.
;
; Conventions shared by every routine in this file:
;   r0 = INT16 *input   4x4 coefficients, rows 8 bytes apart
;   r1 = INT16 *output  4x4 result, rows 'pitch' bytes apart
;   r2 = INT32  pitch   byte distance between consecutive output rows
;
; Constants (scaled by 2^16, consumed through the ">> 16" of smulw*):
;   cospi8sqrt2minus1 = 0x4E7B (20091)
;   sinpi8sqrt2       = 0x8A8C (35468)
;
; Each routine pushes the callee-saved registers it modifies together with
; lr and returns by popping straight into pc.
;
; NOTE(review): the routines use ldr/str/strd on the input and output
; buffers, so they presumably expect suitably aligned pointers and pitch --
; confirm against the callers.
;------------------------------------------------------------------------------

    EXPORT  |vp8_short_idct4x4llm_1_v6|
    EXPORT  |vp8_short_idct4x4llm_v6|
    EXPORT  |vp8_short_idct4x4llm_v6_scott|
    EXPORT  |vp8_short_idct4x4llm_v6_dual|

    AREA    |.text|, CODE, READONLY

;******************************************************************************
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;*      r0  INT16 * input
;*      r1  INT16 * output
;*      r2  INT32   pitch
;*
;* DC-only shortcut: reads input[0] only and fills all 16 output samples
;* with (input[0] + 4) >> 3.
;******************************************************************************
|vp8_short_idct4x4llm_1_v6| PROC
    ldrsh   r0, [r0]                ; r0 = input[0], sign-extended
    add     r0, r0, #4              ; rounding term
    stmdb   sp!, {r4, r5, lr}       ; r4/r5 become the strd register pair
    mov     r0, r0, asr #3          ; r0 = (input[0] + 4) >> 3
    pkhbt   r4, r0, r0, lsl #16     ; r4 = value | value
    mov     r5, r4                  ; r5:r4 = four copies = one output row

    strd    r4, [r1], r2            ; row 0, then output += pitch
    strd    r4, [r1], r2            ; row 1
    strd    r4, [r1], r2            ; row 2
    strd    r4, [r1]                ; row 3

    ldmia   sp!, {r4, r5, pc}       ; restore and return
    ENDP    ; |vp8_short_idct4x4llm_1_v6|

;******************************************************************************
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;*      r0  INT16 * input
;*      r1  INT16 * output
;*      r2  INT32   pitch
;*
;* Scalar two-pass transform, one sample at a time.
;*   Pass 1 walks the four input columns and writes the intermediate
;*          column into 'output'.
;*   Pass 2 re-reads 'output' row by row, transforms each row in place and
;*          applies the final (x + 4) >> 3.
;*
;* Per column/row (x0..x3 = the four samples):
;*   a1 = x0 + x2
;*   b1 = x0 - x2
;*   c1 = ((x1 * sinpi8sqrt2) >> 16) - (x3 + ((x3 * cospi8sqrt2minus1) >> 16))
;*   d1 = (x1 + ((x1 * cospi8sqrt2minus1) >> 16)) + ((x3 * sinpi8sqrt2) >> 16)
;*   out0 = a1 + d1, out1 = b1 + c1, out2 = b1 - c1, out3 = a1 - d1
;******************************************************************************
|vp8_short_idct4x4llm_v6| PROC
    stmdb   sp!, {r4-r11, lr}       ; save callee-saved registers and lr

    mov     r4, #0x00004E00
    orr     r4, r4, #0x0000007B     ; r4 = cospi8sqrt2minus1
    mov     r5, #0x00008A00
    orr     r5, r5, #0x0000008C     ; r5 = sinpi8sqrt2

    mov     r6, #4                  ; four columns
loop1
    ldrsh   r12, [r0, #8]           ; x1 = ip[4]
    ldrsh   r3, [r0, #24]           ; x3 = ip[12]
    ldrsh   r8, [r0, #16]           ; x2 = ip[8]
    ldrsh   r7, [r0], #0x2          ; x0 = ip[0]; ip++ (next column)
    smulwb  r10, r5, r12            ; (x1 * sinpi8sqrt2) >> 16
    smulwb  r11, r4, r3             ; (x3 * cospi8sqrt2minus1) >> 16
    add     r9, r7, r8              ; a1 = x0 + x2
    sub     r7, r7, r8              ; b1 = x0 - x2
    add     r11, r3, r11            ; x3 + ((x3 * cospi8sqrt2minus1) >> 16)
    rsb     r11, r11, r10           ; c1 = r10 - r11
    smulwb  r3, r5, r3              ; (x3 * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r12            ; (x1 * cospi8sqrt2minus1) >> 16
    add     r8, r7, r11             ; b1 + c1
    strh    r8, [r1, r2]            ; row 1 of this column
    sub     r7, r7, r11             ; b1 - c1
    add     r10, r12, r10           ; x1 + ((x1 * cospi8sqrt2minus1) >> 16)
    add     r3, r10, r3             ; d1
    add     r10, r9, r3             ; a1 + d1
    sub     r3, r9, r3              ; a1 - d1
    add     r8, r2, r2              ; pitch * 2
    strh    r7, [r1, r8]            ; row 2 of this column
    add     r7, r2, r2, lsl #1      ; pitch * 3
    strh    r3, [r1, r7]            ; row 3 of this column
    subs    r6, r6, #1              ; sets the flags tested by bne below
    strh    r10, [r1], #0x2         ; row 0 of this column; op++ (next column)
    bne     loop1

    sub     r1, r1, #8              ; rewind op by the 4 * 2 bytes advanced above
    mov     r6, #4                  ; four rows
loop2
    ldrsh   r11, [r1, #2]           ; x1
    ldrsh   r8, [r1, #6]            ; x3
    ldrsh   r3, [r1, #4]            ; x2
    ldrsh   r0, [r1]                ; x0
    smulwb  r9, r5, r11             ; (x1 * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r8             ; (x3 * cospi8sqrt2minus1) >> 16
    add     r7, r0, r3              ; a1 = x0 + x2
    sub     r0, r0, r3              ; b1 = x0 - x2
    add     r10, r8, r10            ; x3 + ((x3 * cospi8sqrt2minus1) >> 16)
    rsb     r9, r10, r9             ; c1 = r9 - r10
    smulwb  r8, r5, r8              ; (x3 * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r11            ; (x1 * cospi8sqrt2minus1) >> 16
    add     r3, r0, r9              ; b1 + c1
    add     r3, r3, #4              ; + rounding term
    add     r10, r11, r10           ; x1 + ((x1 * cospi8sqrt2minus1) >> 16)
    mov     r3, r3, asr #3          ; (b1 + c1 + 4) >> 3
    strh    r3, [r1, #2]            ; out[1]
    add     r10, r10, r8            ; d1
    add     r3, r7, r10             ; a1 + d1
    add     r3, r3, #4
    sub     r7, r7, r10             ; a1 - d1
    add     r7, r7, #4
    mov     r3, r3, asr #3          ; (a1 + d1 + 4) >> 3
    mov     r7, r7, asr #3          ; (a1 - d1 + 4) >> 3
    strh    r7, [r1, #6]            ; out[3]
    sub     r0, r0, r9              ; b1 - c1
    add     r0, r0, #4
    subs    r6, r6, #1              ; sets the flags tested by bne below
    mov     r0, r0, asr #3          ; (b1 - c1 + 4) >> 3
    strh    r0, [r1, #4]            ; out[2]
    strh    r3, [r1], r2            ; out[0]; op += pitch (next row)
    bne     loop2
returnfull
    ldmia   sp!, {r4 - r11, pc}     ; restore and return
    ENDP    ; |vp8_short_idct4x4llm_v6|

;******************************************************************************
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;*      r0  INT16 * input
;*      r1  INT16 * output
;*      r2  INT32   pitch
;*
;* Alias of vp8_short_idct4x4llm_v6_dual.
;*
;* The earlier body of this entry point was an unfinished draft of the packed
;* two-columns-at-a-time approach: it stored only two of the four first-pass
;* rows, combined the c1/d1 partial terms with the wrong signs and operands,
;* and its second pass was an empty counting loop, so the output block was
;* never a valid transform.  The exported symbol is kept for callers and now
;* forwards to the completed packed implementation.
;******************************************************************************
|vp8_short_idct4x4llm_v6_scott| PROC
    ; r0-r2 and lr are still exactly as the caller set them, so a plain
    ; branch lets the dual routine return directly to our caller.
    b       vp8_short_idct4x4llm_v6_dual
    ENDP    ; |vp8_short_idct4x4llm_v6_scott|

;******************************************************************************
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;*      r0  INT16 * input
;*      r1  INT16 * output
;*      r2  INT32   pitch
;*
;* Same transform as vp8_short_idct4x4llm_v6, but with two 16-bit samples
;* packed per register so each loop runs twice instead of four times.
;*   Pass 1 handles two adjacent columns per iteration.
;*   Pass 2 handles two adjacent rows per iteration, in place in 'output'.
;*
;* Lane notation in comments is "high halfword | low halfword".  "Nc" and
;* "Ns" mean (sample N * cospi8sqrt2minus1) >> 16 and
;* (sample N * sinpi8sqrt2) >> 16.
;******************************************************************************
|vp8_short_idct4x4llm_v6_dual| PROC
    stmdb   sp!, {r4-r11, lr}       ; save callee-saved registers and lr
    mov     r3, #0x00004E00
    orr     r3, r3, #0x0000007B     ; r3 = cospi8sqrt2minus1
    mov     r4, #0x00008A00
    orr     r4, r4, #0x0000008C     ; r4 = sinpi8sqrt2
    mov     r5, #0x2                ; two column pairs
loop1_dual
    ldr     r6, [r0, #(4*2)]        ; i5  | i4
    ldr     r12, [r0, #(12*2)]      ; i13 | i12
    ldr     r14, [r0, #(8*2)]       ; i9  | i8

    smulwt  r9, r3, r6              ; 5c
    smulwb  r7, r3, r6              ; 4c
    smulwt  r10, r4, r6             ; 5s
    smulwb  r8, r4, r6              ; 4s
    pkhbt   r7, r7, r9, lsl #16     ; 5c | 4c
    smulwt  r11, r3, r12            ; 13c
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7              ; i5 + 5c | i4 + 4c
    smulwt  r7, r4, r12             ; 13s
    smulwb  r9, r3, r12             ; 12c
    smulwb  r10, r4, r12            ; 12s
    subs    r5, r5, #0x1            ; flags survive to bne: nothing below sets NZCV
    pkhbt   r9, r9, r11, lsl #16    ; 13c | 12c
    ldr     r11, [r0], #0x4         ; i1 | i0; ip += 2 columns
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9             ; i13 + 13c | i12 + 12c
    usub16  r7, r8, r7              ; c1 pair
    uadd16  r6, r6, r10             ; d1 pair
    uadd16  r10, r11, r14           ; a1 pair
    usub16  r8, r11, r14            ; b1 pair
    uadd16  r9, r10, r6             ; a1 + d1
    usub16  r10, r10, r6            ; a1 - d1
    uadd16  r6, r8, r7              ; b1 + c1
    usub16  r7, r8, r7              ; b1 - c1
    str     r6, [r1, r2]            ; row 1
    add     r6, r2, r2              ; pitch * 2
    str     r7, [r1, r6]            ; row 2
    add     r6, r6, r2              ; pitch * 3
    str     r10, [r1, r6]           ; row 3
    str     r9, [r1], #0x4          ; row 0; op += 2 columns
    bne     loop1_dual

    mov     r5, #0x2                ; two row pairs
    sub     r0, r1, #8              ; r0 = start of output; r1 is scratch from here on
loop2_dual
    ; i0..i3 = current row, i4..i7 = the row one pitch below it.
    ldr     r6, [r0, r2]            ; i5 | i4
    ldr     r1, [r0]                ; i1 | i0
    ldr     r12, [r0, #0x4]         ; i3 | i2
    add     r14, r2, #0x4           ; pitch + 4 bytes
    ldr     r14, [r0, r14]          ; i7 | i6
    smulwt  r9, r3, r6              ; 5c
    smulwt  r7, r3, r1              ; 1c
    smulwt  r10, r4, r6             ; 5s
    smulwt  r8, r4, r1              ; 1s
    pkhbt   r11, r6, r1, lsl #16    ; i0 | i4
    pkhbt   r7, r9, r7, lsl #16     ; 1c | 5c
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s
    pkhtb   r1, r1, r6, asr #16     ; i1 | i5
    uadd16  r1, r7, r1              ; i1 + 1c | i5 + 5c
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9            ; a1 pair
    usub16  r9, r11, r9             ; b1 pair
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7
    subs    r5, r5, #0x1            ; flags survive to bne: nothing below sets NZCV
    smulwt  r7, r3, r6              ; 3c
    smulwt  r11, r4, r6             ; 3s
    smulwb  r12, r3, r6             ; 7c
    smulwb  r14, r4, r6             ; 7s

    pkhbt   r7, r12, r7, lsl #16    ; 3c | 7c
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s
    uadd16  r6, r7, r6              ; i3 + 3c | i7 + 7c
    usub16  r12, r8, r6             ; c1 pair
    uadd16  r6, r11, r1             ; d1 pair
    uadd16  r7, r10, r6             ; a1 + d1
    mov     r8, #0x4
    orr     r8, r8, #0x40000        ; r8 = 4 | 4, rounding term for both lanes
    usub16  r6, r10, r6             ; a1 - d1
    uadd16  r6, r6, r8              ; a1 - d1 + 4   (o3 | o7)
    uadd16  r7, r7, r8              ; a1 + d1 + 4   (o0 | o4)
    uadd16  r10, r9, r12            ; b1 + c1
    usub16  r1, r9, r12             ; b1 - c1
    uadd16  r10, r10, r8            ; b1 + c1 + 4   (o1 | o5)
    uadd16  r1, r1, r8              ; b1 - c1 + 4   (o2 | o6)

    ; High lanes belong to the current row: asr #19 = drop the low lane
    ; (16 bits) and apply the final >> 3 in one shift.
    mov     r8, r10, asr #19
    strh    r8, [r0, #2]            ; o1
    mov     r8, r1, asr #19
    strh    r8, [r0, #4]            ; o2
    mov     r8, r6, asr #19
    strh    r8, [r0, #6]            ; o3
    mov     r8, r7, asr #19
    strh    r8, [r0], r2            ; o0; advance to the next row

    ; Low lanes belong to the next row: sign-extend, then >> 3.
    sxth    r10, r10
    mov     r8, r10, asr #3
    strh    r8, [r0, #2]            ; o5
    sxth    r1, r1
    mov     r8, r1, asr #3
    strh    r8, [r0, #4]            ; o6
    sxth    r6, r6
    mov     r8, r6, asr #3
    strh    r8, [r0, #6]            ; o7
    sxth    r7, r7
    mov     r8, r7, asr #3
    strh    r8, [r0], r2            ; o4; advance to the next row pair
    bne     loop2_dual

    ldmia   sp!, {r4 - r11, pc}     ; restore and return
    ENDP    ; |vp8_short_idct4x4llm_v6_dual|

    END