/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)

void TransposeWx8_DSPR2(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  __asm__ __volatile__(
      ".set push                                     \n"
      ".set noreorder                                \n"
      "sll          $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll          $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll          $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu         $t3, $t2, %[src_stride]          \n"
      "addu         $t5, $t4, %[src_stride]          \n"
      "addu         $t6, $t2, $t4                    \n"
      "andi         $t0, %[dst], 0x3                 \n"
      "andi         $t1, %[dst_stride], 0x3          \n"
      "or           $t0, $t0, $t1                    \n"
      "bnez         $t0, 11f                         \n"
      " subu        $t7, $t9, %[src_stride]          \n"
      // dst + dst_stride word aligned
      "1:                                            \n"
      "lbu          $t0, 0(%[src])                   \n"
      "lbux         $t1, %[src_stride](%[src])       \n"
      "lbux         $t8, $t2(%[src])                 \n"
      "lbux         $t9, $t3(%[src])                 \n"
      "sll          $t1, $t1, 16                     \n"
      "sll          $t9, $t9, 16                     \n"
      "or           $t0, $t0, $t1                    \n"
      "or           $t8, $t8, $t9                    \n"
      "precr.qb.ph  $s0, $t8, $t0                    \n"
      "lbux         $t0, $t4(%[src])                 \n"
      "lbux         $t1, $t5(%[src])                 \n"
      "lbux         $t8, $t6(%[src])                 \n"
      "lbux         $t9, $t7(%[src])                 \n"
      "sll          $t1, $t1, 16                     \n"
      "sll          $t9, $t9, 16                     \n"
      "or           $t0, $t0, $t1                    \n"
      "or           $t8, $t8, $t9                    \n"
      "precr.qb.ph  $s1, $t8, $t0                    \n"
      "sw           $s0, 0(%[dst])                   \n"
      "addiu        %[width], -1                     \n"
      "addiu        %[src], 1                        \n"
      "sw           $s1, 4(%[dst])                   \n"
      "bnez         %[width], 1b                     \n"
      " addu        %[dst], %[dst], %[dst_stride]    \n"
      "b            2f                               \n"
      // dst + dst_stride unaligned
      "11:                                           \n"
      "lbu          $t0, 0(%[src])                   \n"
      "lbux         $t1, %[src_stride](%[src])       \n"
      "lbux         $t8, $t2(%[src])                 \n"
      "lbux         $t9, $t3(%[src])                 \n"
      "sll          $t1, $t1, 16                     \n"
      "sll          $t9, $t9, 16                     \n"
      "or           $t0, $t0, $t1                    \n"
      "or           $t8, $t8, $t9                    \n"
      "precr.qb.ph  $s0, $t8, $t0                    \n"
      "lbux         $t0, $t4(%[src])                 \n"
      "lbux         $t1, $t5(%[src])                 \n"
      "lbux         $t8, $t6(%[src])                 \n"
      "lbux         $t9, $t7(%[src])                 \n"
      "sll          $t1, $t1, 16                     \n"
      "sll          $t9, $t9, 16                     \n"
      "or           $t0, $t0, $t1                    \n"
      "or           $t8, $t8, $t9                    \n"
      "precr.qb.ph  $s1, $t8, $t0                    \n"
      "swr          $s0, 0(%[dst])                   \n"
      "swl          $s0, 3(%[dst])                   \n"
      "addiu        %[width], -1                     \n"
      "addiu        %[src], 1                        \n"
      "swr          $s1, 4(%[dst])                   \n"
      "swl          $s1, 7(%[dst])                   \n"
      "bnez         %[width], 11b                    \n"
      " addu        %[dst], %[dst], %[dst_stride]    \n"
      "2:                                            \n"
      ".set pop                                      \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0",
        "s1");
}
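
// For reference, the kernel above computes the same result as the portable C
// sketch below (modeled on libyuv's generic C path; the name and the fact
// that it is unused here are illustrative, not part of the original file):
// each iteration gathers one byte from each of 8 source rows and stores them
// as a single 8-byte destination row.
static void TransposeWx8_Ref(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  int i;
  int j;
  for (i = 0; i < width; ++i) {  // One source column per iteration...
    for (j = 0; j < 8; ++j) {    // ...becomes one 8-byte dst row.
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}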

void TransposeWx8_Fast_DSPR2(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  __asm__ __volatile__(
      ".set noat                                     \n"
      ".set push                                     \n"
      ".set noreorder                                \n"
      "beqz         %[width], 2f                     \n"
      " sll         $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll          $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll          $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu         $t3, $t2, %[src_stride]          \n"
      "addu         $t5, $t4, %[src_stride]          \n"
      "addu         $t6, $t2, $t4                    \n"

      "srl          $AT, %[width], 0x2               \n"
      "andi         $t0, %[dst], 0x3                 \n"
      "andi         $t1, %[dst_stride], 0x3          \n"
      "or           $t0, $t0, $t1                    \n"
      "bnez         $t0, 11f                         \n"
      " subu        $t7, $t9, %[src_stride]          \n"
      // dst + dst_stride word aligned
      "1:                                            \n"
      "lw           $t0, 0(%[src])                   \n"
      "lwx          $t1, %[src_stride](%[src])       \n"
      "lwx          $t8, $t2(%[src])                 \n"
      "lwx          $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph  $s0, $t1, $t0                    \n"
      "precr.qb.ph  $s1, $t9, $t8                    \n"
      "precrq.qb.ph $s2, $t1, $t0                    \n"
      "precrq.qb.ph $s3, $t9, $t8                    \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph  $s4, $s1, $s0                    \n"
      "precrq.qb.ph $s5, $s1, $s0                    \n"
      "precr.qb.ph  $s6, $s3, $s2                    \n"
      "precrq.qb.ph $s7, $s3, $s2                    \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx          $t0, $t4(%[src])                 \n"
      "lwx          $t1, $t5(%[src])                 \n"
      "lwx          $t8, $t6(%[src])                 \n"
      "lwx          $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph  $s0, $t1, $t0                    \n"
      "precr.qb.ph  $s1, $t9, $t8                    \n"
      "precrq.qb.ph $s2, $t1, $t0                    \n"
      "precrq.qb.ph $s3, $t9, $t8                    \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph  $t0, $s1, $s0                    \n"
      "precrq.qb.ph $t1, $s1, $s0                    \n"
      "precr.qb.ph  $t8, $s3, $s2                    \n"
      "precrq.qb.ph $t9, $s3, $s2                    \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu         $s0, %[dst], %[dst_stride]       \n"
      "addu         $s1, $s0, %[dst_stride]          \n"
      "addu         $s2, $s1, %[dst_stride]          \n"

      "sw           $s4, 0(%[dst])                   \n"
      "sw           $t0, 4(%[dst])                   \n"
      "sw           $s6, 0($s0)                      \n"
      "sw           $t8, 4($s0)                      \n"
      "sw           $s5, 0($s1)                      \n"
      "sw           $t1, 4($s1)                      \n"
      "sw           $s7, 0($s2)                      \n"
      "sw           $t9, 4($s2)                      \n"

      "addiu        $AT, -1                          \n"
      "addiu        %[src], 4                        \n"

      "bnez         $AT, 1b                          \n"
      " addu        %[dst], $s2, %[dst_stride]       \n"
      "b            2f                               \n"
      // dst + dst_stride unaligned
      "11:                                           \n"
      "lw           $t0, 0(%[src])                   \n"
      "lwx          $t1, %[src_stride](%[src])       \n"
      "lwx          $t8, $t2(%[src])                 \n"
      "lwx          $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph  $s0, $t1, $t0                    \n"
      "precr.qb.ph  $s1, $t9, $t8                    \n"
      "precrq.qb.ph $s2, $t1, $t0                    \n"
      "precrq.qb.ph $s3, $t9, $t8                    \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph  $s4, $s1, $s0                    \n"
      "precrq.qb.ph $s5, $s1, $s0                    \n"
      "precr.qb.ph  $s6, $s3, $s2                    \n"
      "precrq.qb.ph $s7, $s3, $s2                    \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx          $t0, $t4(%[src])                 \n"
      "lwx          $t1, $t5(%[src])                 \n"
      "lwx          $t8, $t6(%[src])                 \n"
      "lwx          $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph  $s0, $t1, $t0                    \n"
      "precr.qb.ph  $s1, $t9, $t8                    \n"
      "precrq.qb.ph $s2, $t1, $t0                    \n"
      "precrq.qb.ph $s3, $t9, $t8                    \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph  $t0, $s1, $s0                    \n"
      "precrq.qb.ph $t1, $s1, $s0                    \n"
      "precr.qb.ph  $t8, $s3, $s2                    \n"
      "precrq.qb.ph $t9, $s3, $s2                    \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu         $s0, %[dst], %[dst_stride]       \n"
      "addu         $s1, $s0, %[dst_stride]          \n"
      "addu         $s2, $s1, %[dst_stride]          \n"

      "swr          $s4, 0(%[dst])                   \n"
      "swl          $s4, 3(%[dst])                   \n"
      "swr          $t0, 4(%[dst])                   \n"
      "swl          $t0, 7(%[dst])                   \n"
      "swr          $s6, 0($s0)                      \n"
      "swl          $s6, 3($s0)                      \n"
      "swr          $t8, 4($s0)                      \n"
      "swl          $t8, 7($s0)                      \n"
      "swr          $s5, 0($s1)                      \n"
      "swl          $s5, 3($s1)                      \n"
      "swr          $t1, 4($s1)                      \n"
      "swl          $t1, 7($s1)                      \n"
      "swr          $s7, 0($s2)                      \n"
      "swl          $s7, 3($s2)                      \n"
      "swr          $t9, 4($s2)                      \n"
      "swl          $t9, 7($s2)                      \n"

      "addiu        $AT, -1                          \n"
      "addiu        %[src], 4                        \n"

      "bnez         $AT, 11b                         \n"
      " addu        %[dst], $s2, %[dst_stride]       \n"
      "2:                                            \n"
      ".set pop                                      \n"
      ".set at                                       \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6", "s7");
}
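
// The 4x4 tile shuffles above lean entirely on two DSPR2 pack instructions.
// As a sketch of their lane behavior in portable C (helper names are
// illustrative, and a word is the little-endian byte vector |b3|b2|b1|b0|):
// precr.qb.ph keeps the even bytes of each operand, precrq.qb.ph the odd
// bytes. Applying the pair twice, as the loop does, turns four row words
// into four column words, which is exactly the transposition traced by the
// lane comments.
static uint32 Precr_qb_ph(uint32 rs, uint32 rt) {
  // Result: | rs.b2 | rs.b0 | rt.b2 | rt.b0 |
  return (((rs >> 16) & 0xff) << 24) | ((rs & 0xff) << 16) |
         (((rt >> 16) & 0xff) << 8) | (rt & 0xff);
}
static uint32 Precrq_qb_ph(uint32 rs, uint32 rt) {
  // Result: | rs.b3 | rs.b1 | rt.b3 | rt.b1 |
  return (((rs >> 24) & 0xff) << 24) | (((rs >> 8) & 0xff) << 16) |
         (((rt >> 24) & 0xff) << 8) | ((rt >> 8) & 0xff);
}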

void TransposeUVWx8_DSPR2(const uint8* src,
                          int src_stride,
                          uint8* dst_a,
                          int dst_stride_a,
                          uint8* dst_b,
                          int dst_stride_b,
                          int width) {
  __asm__ __volatile__(
      ".set push                                     \n"
      ".set noreorder                                \n"
      "beqz         %[width], 2f                     \n"
      " sll         $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll          $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll          $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu         $t3, $t2, %[src_stride]          \n"
      "addu         $t5, $t4, %[src_stride]          \n"
      "addu         $t6, $t2, $t4                    \n"
      "subu         $t7, $t9, %[src_stride]          \n"
      "srl          $t1, %[width], 1                 \n"

      // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
      "andi         $t0, %[dst_a], 0x3               \n"
      "andi         $t8, %[dst_b], 0x3               \n"
      "or           $t0, $t0, $t8                    \n"
      "andi         $t8, %[dst_stride_a], 0x3        \n"
      "andi         $s5, %[dst_stride_b], 0x3        \n"
      "or           $t8, $t8, $s5                    \n"
      "or           $t0, $t0, $t8                    \n"
      "bnez         $t0, 11f                         \n"
      " nop                                          \n"
      // dst + dst_stride word aligned (both a and b dst addresses)
      "1:                                            \n"
      "lw           $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
      "lwx          $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
      "addu         $s5, %[dst_a], %[dst_stride_a]   \n"
      "lwx          $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
      "lwx          $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
      "addu         $s6, %[dst_b], %[dst_stride_b]   \n"

      "precrq.ph.w  $s1, $t8, $t0                    \n"  // |B1|A1|B0|A0|
      "precrq.ph.w  $s2, $s0, $t9                    \n"  // |B3|A3|B2|A2|
      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |B3|B2|B1|B0|

      "sll          $t0, $t0, 16                     \n"
      "packrl.ph    $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
      "sll          $t9, $t9, 16                     \n"
      "packrl.ph    $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

      "sw           $s3, 0($s5)                      \n"
      "sw           $s4, 0($s6)                      \n"

      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |b3|b2|b1|b0|

      "lwx          $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
      "lwx          $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
      "lwx          $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
      "lwx          $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
      "sw           $s3, 0(%[dst_a])                 \n"
      "sw           $s4, 0(%[dst_b])                 \n"

      "precrq.ph.w  $s1, $t8, $t0                    \n"  // |B5|A5|B4|A4|
      "precrq.ph.w  $s2, $s0, $t9                    \n"  // |B7|A7|B6|A6|
      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |B7|B6|B5|B4|

      "sll          $t0, $t0, 16                     \n"
      "packrl.ph    $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
      "sll          $t9, $t9, 16                     \n"
      "packrl.ph    $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|
      "sw           $s3, 4($s5)                      \n"
      "sw           $s4, 4($s6)                      \n"

      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |b7|b6|b5|b4|

      "addiu        %[src], 4                        \n"
      "addiu        $t1, -1                          \n"
      "sll          $t0, %[dst_stride_a], 1          \n"
      "sll          $t8, %[dst_stride_b], 1          \n"
      "sw           $s3, 4(%[dst_a])                 \n"
      "sw           $s4, 4(%[dst_b])                 \n"
      "addu         %[dst_a], %[dst_a], $t0          \n"
      "bnez         $t1, 1b                          \n"
      " addu        %[dst_b], %[dst_b], $t8          \n"
      "b            2f                               \n"
      " nop                                          \n"

      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
      "11:                                           \n"
      "lw           $t0, 0(%[src])                   \n"  // |B0|A0|b0|a0|
      "lwx          $t8, %[src_stride](%[src])       \n"  // |B1|A1|b1|a1|
      "addu         $s5, %[dst_a], %[dst_stride_a]   \n"
      "lwx          $t9, $t2(%[src])                 \n"  // |B2|A2|b2|a2|
      "lwx          $s0, $t3(%[src])                 \n"  // |B3|A3|b3|a3|
      "addu         $s6, %[dst_b], %[dst_stride_b]   \n"

      "precrq.ph.w  $s1, $t8, $t0                    \n"  // |B1|A1|B0|A0|
      "precrq.ph.w  $s2, $s0, $t9                    \n"  // |B3|A3|B2|A2|
      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |B3|B2|B1|B0|

      "sll          $t0, $t0, 16                     \n"
      "packrl.ph    $s1, $t8, $t0                    \n"  // |b1|a1|b0|a0|
      "sll          $t9, $t9, 16                     \n"
      "packrl.ph    $s2, $s0, $t9                    \n"  // |b3|a3|b2|a2|

      "swr          $s3, 0($s5)                      \n"
      "swl          $s3, 3($s5)                      \n"
      "swr          $s4, 0($s6)                      \n"
      "swl          $s4, 3($s6)                      \n"

      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |b3|b2|b1|b0|

      "lwx          $t0, $t4(%[src])                 \n"  // |B4|A4|b4|a4|
      "lwx          $t8, $t5(%[src])                 \n"  // |B5|A5|b5|a5|
      "lwx          $t9, $t6(%[src])                 \n"  // |B6|A6|b6|a6|
      "lwx          $s0, $t7(%[src])                 \n"  // |B7|A7|b7|a7|
      "swr          $s3, 0(%[dst_a])                 \n"
      "swl          $s3, 3(%[dst_a])                 \n"
      "swr          $s4, 0(%[dst_b])                 \n"
      "swl          $s4, 3(%[dst_b])                 \n"

      "precrq.ph.w  $s1, $t8, $t0                    \n"  // |B5|A5|B4|A4|
      "precrq.ph.w  $s2, $s0, $t9                    \n"  // |B7|A7|B6|A6|
      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |B7|B6|B5|B4|

      "sll          $t0, $t0, 16                     \n"
      "packrl.ph    $s1, $t8, $t0                    \n"  // |b5|a5|b4|a4|
      "sll          $t9, $t9, 16                     \n"
      "packrl.ph    $s2, $s0, $t9                    \n"  // |b7|a7|b6|a6|

      "swr          $s3, 4($s5)                      \n"
      "swl          $s3, 7($s5)                      \n"
      "swr          $s4, 4($s6)                      \n"
      "swl          $s4, 7($s6)                      \n"

      "precr.qb.ph  $s3, $s2, $s1                    \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph $s4, $s2, $s1                    \n"  // |b7|b6|b5|b4|

      "addiu        %[src], 4                        \n"
      "addiu        $t1, -1                          \n"
      "sll          $t0, %[dst_stride_a], 1          \n"
      "sll          $t8, %[dst_stride_b], 1          \n"
      "swr          $s3, 4(%[dst_a])                 \n"
      "swl          $s3, 7(%[dst_a])                 \n"
      "swr          $s4, 4(%[dst_b])                 \n"
      "swl          $s4, 7(%[dst_b])                 \n"
      "addu         %[dst_a], %[dst_a], $t0          \n"
      "bnez         $t1, 11b                         \n"
      " addu        %[dst_b], %[dst_b], $t8          \n"

      "2:                                            \n"
      ".set pop                                      \n"
      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
        [width] "+r"(width), [src_stride] "+r"(src_stride)
      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6");
}
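
// Again for reference, a portable C sketch of the interleaved transpose
// above (modeled on libyuv's generic C path; the name and the fact that it
// is unused here are illustrative): even source bytes (a0, a1, ...) feed
// dst_a and odd source bytes (b0, b1, ...) feed dst_b, so the two planes
// come out deinterleaved as well as transposed.
static void TransposeUVWx8_Ref(const uint8* src,
                               int src_stride,
                               uint8* dst_a,
                               int dst_stride_a,
                               uint8* dst_b,
                               int dst_stride_b,
                               int width) {
  int i;
  int j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];
    }
  }
}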

#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif