/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_MIPS) && \
    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
    (_MIPS_SIM == _MIPS_SIM_ABI32)

void TransposeWx8_DSPR2(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "sll $t2, %[src_stride], 0x1 \n"  // src_stride x 2
    "sll $t4, %[src_stride], 0x2 \n"  // src_stride x 4
    "sll $t9, %[src_stride], 0x3 \n"  // src_stride x 8
    "addu $t3, $t2, %[src_stride] \n"
    "addu $t5, $t4, %[src_stride] \n"
    "addu $t6, $t2, $t4 \n"
    "andi $t0, %[dst], 0x3 \n"
    "andi $t1, %[dst_stride], 0x3 \n"
    "or $t0, $t0, $t1 \n"
    "bnez $t0, 11f \n"
    " subu $t7, $t9, %[src_stride] \n"
    // dst + dst_stride word aligned
    "1: \n"
    "lbu $t0, 0(%[src]) \n"
    "lbux $t1, %[src_stride](%[src]) \n"
    "lbux $t8, $t2(%[src]) \n"
    "lbux $t9, $t3(%[src]) \n"
    "sll $t1, $t1, 16 \n"
    "sll $t9, $t9, 16 \n"
    "or $t0, $t0, $t1 \n"
    "or $t8, $t8, $t9 \n"
    "precr.qb.ph $s0, $t8, $t0 \n"
    "lbux $t0, $t4(%[src]) \n"
    "lbux $t1, $t5(%[src]) \n"
    "lbux $t8, $t6(%[src]) \n"
    "lbux $t9, $t7(%[src]) \n"
    "sll $t1, $t1, 16 \n"
    "sll $t9, $t9, 16 \n"
    "or $t0, $t0, $t1 \n"
    "or $t8, $t8, $t9 \n"
    "precr.qb.ph $s1, $t8, $t0 \n"
    "sw $s0, 0(%[dst]) \n"
    "addiu %[width], -1 \n"
    "addiu %[src], 1 \n"
    "sw $s1, 4(%[dst]) \n"
    "bnez %[width], 1b \n"
    " addu %[dst], %[dst], %[dst_stride] \n"
    "b 2f \n"
    // dst + dst_stride unaligned
    "11: \n"
    "lbu $t0, 0(%[src]) \n"
    "lbux $t1, %[src_stride](%[src]) \n"
    "lbux $t8, $t2(%[src]) \n"
    "lbux $t9, $t3(%[src]) \n"
    "sll $t1, $t1, 16 \n"
    "sll $t9, $t9, 16 \n"
    "or $t0, $t0, $t1 \n"
    "or $t8, $t8, $t9 \n"
    "precr.qb.ph $s0, $t8, $t0 \n"
    "lbux $t0, $t4(%[src]) \n"
    "lbux $t1, $t5(%[src]) \n"
    "lbux $t8, $t6(%[src]) \n"
    "lbux $t9, $t7(%[src]) \n"
    "sll $t1, $t1, 16 \n"
    "sll $t9, $t9, 16 \n"
    "or $t0, $t0, $t1 \n"
    "or $t8, $t8, $t9 \n"
    "precr.qb.ph $s1, $t8, $t0 \n"
    "swr $s0, 0(%[dst]) \n"
    "swl $s0, 3(%[dst]) \n"
    "addiu %[width], -1 \n"
    "addiu %[src], 1 \n"
    "swr $s1, 4(%[dst]) \n"
    "swl $s1, 7(%[dst]) \n"
    "bnez %[width], 11b \n"
    " addu %[dst], %[dst], %[dst_stride] \n"
    "2: \n"
    ".set pop \n"
    : [src] "+r" (src),
      [dst] "+r" (dst),
      [width] "+r" (width)
    : [src_stride] "r" (src_stride),
      [dst_stride] "r" (dst_stride)
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1"
  );
}

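// For reference, a minimal scalar sketch of the transpose the assembly above
// performs, loosely modeled on libyuv's portable C fallback (TransposeWx8_C
// in rotate_common.cc). It is illustrative only and not wired into libyuv's
// dispatch; the name TransposeWx8_Ref is ours. Each iteration gathers one
// byte from each of 8 consecutive source rows and writes them as one 8-byte
// destination row.
static void TransposeWx8_Ref(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst[j] = src[j * src_stride];  // column i of src becomes row i of dst
    }
    ++src;              // advance to the next source column
    dst += dst_stride;  // advance to the next destination row
  }
}
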
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  __asm__ __volatile__ (
    ".set noat \n"
    ".set push \n"
    ".set noreorder \n"
    "beqz %[width], 2f \n"
    " sll $t2, %[src_stride], 0x1 \n"  // src_stride x 2
    "sll $t4, %[src_stride], 0x2 \n"  // src_stride x 4
    "sll $t9, %[src_stride], 0x3 \n"  // src_stride x 8
    "addu $t3, $t2, %[src_stride] \n"
    "addu $t5, $t4, %[src_stride] \n"
    "addu $t6, $t2, $t4 \n"

    "srl $AT, %[width], 0x2 \n"
    "andi $t0, %[dst], 0x3 \n"
    "andi $t1, %[dst_stride], 0x3 \n"
    "or $t0, $t0, $t1 \n"
    "bnez $t0, 11f \n"
    " subu $t7, $t9, %[src_stride] \n"
    // dst + dst_stride word aligned
    "1: \n"
    "lw $t0, 0(%[src]) \n"
    "lwx $t1, %[src_stride](%[src]) \n"
    "lwx $t8, $t2(%[src]) \n"
    "lwx $t9, $t3(%[src]) \n"

    // t0 = | 30 | 20 | 10 | 00 |
    // t1 = | 31 | 21 | 11 | 01 |
    // t8 = | 32 | 22 | 12 | 02 |
    // t9 = | 33 | 23 | 13 | 03 |

    "precr.qb.ph $s0, $t1, $t0 \n"
    "precr.qb.ph $s1, $t9, $t8 \n"
    "precrq.qb.ph $s2, $t1, $t0 \n"
    "precrq.qb.ph $s3, $t9, $t8 \n"

    // s0 = | 21 | 01 | 20 | 00 |
    // s1 = | 23 | 03 | 22 | 02 |
    // s2 = | 31 | 11 | 30 | 10 |
    // s3 = | 33 | 13 | 32 | 12 |

    "precr.qb.ph $s4, $s1, $s0 \n"
    "precrq.qb.ph $s5, $s1, $s0 \n"
    "precr.qb.ph $s6, $s3, $s2 \n"
    "precrq.qb.ph $s7, $s3, $s2 \n"

    // s4 = | 03 | 02 | 01 | 00 |
    // s5 = | 23 | 22 | 21 | 20 |
    // s6 = | 13 | 12 | 11 | 10 |
    // s7 = | 33 | 32 | 31 | 30 |

    "lwx $t0, $t4(%[src]) \n"
    "lwx $t1, $t5(%[src]) \n"
    "lwx $t8, $t6(%[src]) \n"
    "lwx $t9, $t7(%[src]) \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph $s0, $t1, $t0 \n"
    "precr.qb.ph $s1, $t9, $t8 \n"
    "precrq.qb.ph $s2, $t1, $t0 \n"
    "precrq.qb.ph $s3, $t9, $t8 \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph $t0, $s1, $s0 \n"
    "precrq.qb.ph $t1, $s1, $s0 \n"
    "precr.qb.ph $t8, $s3, $s2 \n"
    "precrq.qb.ph $t9, $s3, $s2 \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    "addu $s0, %[dst], %[dst_stride] \n"
    "addu $s1, $s0, %[dst_stride] \n"
    "addu $s2, $s1, %[dst_stride] \n"

    "sw $s4, 0(%[dst]) \n"
    "sw $t0, 4(%[dst]) \n"
    "sw $s6, 0($s0) \n"
    "sw $t8, 4($s0) \n"
    "sw $s5, 0($s1) \n"
    "sw $t1, 4($s1) \n"
    "sw $s7, 0($s2) \n"
    "sw $t9, 4($s2) \n"

    "addiu $AT, -1 \n"
    "addiu %[src], 4 \n"

    "bnez $AT, 1b \n"
    " addu %[dst], $s2, %[dst_stride] \n"
    "b 2f \n"
    // dst + dst_stride unaligned
    "11: \n"
    "lw $t0, 0(%[src]) \n"
    "lwx $t1, %[src_stride](%[src]) \n"
    "lwx $t8, $t2(%[src]) \n"
    "lwx $t9, $t3(%[src]) \n"

    // t0 = | 30 | 20 | 10 | 00 |
    // t1 = | 31 | 21 | 11 | 01 |
    // t8 = | 32 | 22 | 12 | 02 |
    // t9 = | 33 | 23 | 13 | 03 |

    "precr.qb.ph $s0, $t1, $t0 \n"
    "precr.qb.ph $s1, $t9, $t8 \n"
    "precrq.qb.ph $s2, $t1, $t0 \n"
    "precrq.qb.ph $s3, $t9, $t8 \n"

    // s0 = | 21 | 01 | 20 | 00 |
    // s1 = | 23 | 03 | 22 | 02 |
    // s2 = | 31 | 11 | 30 | 10 |
    // s3 = | 33 | 13 | 32 | 12 |

    "precr.qb.ph $s4, $s1, $s0 \n"
    "precrq.qb.ph $s5, $s1, $s0 \n"
    "precr.qb.ph $s6, $s3, $s2 \n"
    "precrq.qb.ph $s7, $s3, $s2 \n"

    // s4 = | 03 | 02 | 01 | 00 |
    // s5 = | 23 | 22 | 21 | 20 |
    // s6 = | 13 | 12 | 11 | 10 |
    // s7 = | 33 | 32 | 31 | 30 |

    "lwx $t0, $t4(%[src]) \n"
    "lwx $t1, $t5(%[src]) \n"
    "lwx $t8, $t6(%[src]) \n"
    "lwx $t9, $t7(%[src]) \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph $s0, $t1, $t0 \n"
    "precr.qb.ph $s1, $t9, $t8 \n"
    "precrq.qb.ph $s2, $t1, $t0 \n"
    "precrq.qb.ph $s3, $t9, $t8 \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph $t0, $s1, $s0 \n"
    "precrq.qb.ph $t1, $s1, $s0 \n"
    "precr.qb.ph $t8, $s3, $s2 \n"
    "precrq.qb.ph $t9, $s3, $s2 \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    "addu $s0, %[dst], %[dst_stride] \n"
    "addu $s1, $s0, %[dst_stride] \n"
    "addu $s2, $s1, %[dst_stride] \n"

    "swr $s4, 0(%[dst]) \n"
    "swl $s4, 3(%[dst]) \n"
    "swr $t0, 4(%[dst]) \n"
    "swl $t0, 7(%[dst]) \n"
    "swr $s6, 0($s0) \n"
    "swl $s6, 3($s0) \n"
    "swr $t8, 4($s0) \n"
    "swl $t8, 7($s0) \n"
    "swr $s5, 0($s1) \n"
    "swl $s5, 3($s1) \n"
    "swr $t1, 4($s1) \n"
    "swl $t1, 7($s1) \n"
    "swr $s7, 0($s2) \n"
    "swl $s7, 3($s2) \n"
    "swr $t9, 4($s2) \n"
    "swl $t9, 7($s2) \n"

    "addiu $AT, -1 \n"
    "addiu %[src], 4 \n"

    "bnez $AT, 11b \n"
    " addu %[dst], $s2, %[dst_stride] \n"
    "2: \n"
    ".set pop \n"
    ".set at \n"
    : [src] "+r" (src),
      [dst] "+r" (dst),
      [width] "+r" (width)
    : [src_stride] "r" (src_stride),
      [dst_stride] "r" (dst_stride)
    : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
  );
}

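// The Wx8 kernels above lean on two DSPr2 pack instructions. As a sketch of
// their byte-level behavior (our reading of the MIPS DSP ASE manual; the
// helper names are ours and nothing in libyuv uses them): precr.qb.ph keeps
// the low byte of each halfword of its two inputs, precrq.qb.ph keeps the
// high byte. Applying one precr/precrq round to four input words and a second
// round to the results, as the loops above do, yields a 4x4 byte transpose.
static uint32 PrecrQbPh(uint32 rs, uint32 rt) {   // |rs2|rs0|rt2|rt0|
  return (((rs >> 16) & 0xff) << 24) | ((rs & 0xff) << 16) |
         (((rt >> 16) & 0xff) << 8) | (rt & 0xff);
}

static uint32 PrecrqQbPh(uint32 rs, uint32 rt) {  // |rs3|rs1|rt3|rt1|
  return ((rs >> 24) << 24) | (((rs >> 8) & 0xff) << 16) |
         (((rt >> 24) & 0xff) << 8) | ((rt >> 8) & 0xff);
}
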
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
                          uint8* dst_a, int dst_stride_a,
                          uint8* dst_b, int dst_stride_b,
                          int width) {
  __asm__ __volatile__ (
    ".set push \n"
    ".set noreorder \n"
    "beqz %[width], 2f \n"
    " sll $t2, %[src_stride], 0x1 \n"  // src_stride x 2
    "sll $t4, %[src_stride], 0x2 \n"  // src_stride x 4
    "sll $t9, %[src_stride], 0x3 \n"  // src_stride x 8
    "addu $t3, $t2, %[src_stride] \n"
    "addu $t5, $t4, %[src_stride] \n"
    "addu $t6, $t2, $t4 \n"
    "subu $t7, $t9, %[src_stride] \n"
    "srl $t1, %[width], 1 \n"

    // check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
    "andi $t0, %[dst_a], 0x3 \n"
    "andi $t8, %[dst_b], 0x3 \n"
    "or $t0, $t0, $t8 \n"
    "andi $t8, %[dst_stride_a], 0x3 \n"
    "andi $s5, %[dst_stride_b], 0x3 \n"
    "or $t8, $t8, $s5 \n"
    "or $t0, $t0, $t8 \n"
    "bnez $t0, 11f \n"
    " nop \n"
    // dst + dst_stride word aligned (both, a & b dst addresses)
    "1: \n"
    "lw $t0, 0(%[src]) \n"  // |B0|A0|b0|a0|
    "lwx $t8, %[src_stride](%[src]) \n"  // |B1|A1|b1|a1|
    "addu $s5, %[dst_a], %[dst_stride_a] \n"
    "lwx $t9, $t2(%[src]) \n"  // |B2|A2|b2|a2|
    "lwx $s0, $t3(%[src]) \n"  // |B3|A3|b3|a3|
    "addu $s6, %[dst_b], %[dst_stride_b] \n"

    "precrq.ph.w $s1, $t8, $t0 \n"  // |B1|A1|B0|A0|
    "precrq.ph.w $s2, $s0, $t9 \n"  // |B3|A3|B2|A2|
    "precr.qb.ph $s3, $s2, $s1 \n"  // |A3|A2|A1|A0|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |B3|B2|B1|B0|

    "sll $t0, $t0, 16 \n"
    "packrl.ph $s1, $t8, $t0 \n"  // |b1|a1|b0|a0|
    "sll $t9, $t9, 16 \n"
    "packrl.ph $s2, $s0, $t9 \n"  // |b3|a3|b2|a2|

    "sw $s3, 0($s5) \n"
    "sw $s4, 0($s6) \n"

    "precr.qb.ph $s3, $s2, $s1 \n"  // |a3|a2|a1|a0|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |b3|b2|b1|b0|

    "lwx $t0, $t4(%[src]) \n"  // |B4|A4|b4|a4|
    "lwx $t8, $t5(%[src]) \n"  // |B5|A5|b5|a5|
    "lwx $t9, $t6(%[src]) \n"  // |B6|A6|b6|a6|
    "lwx $s0, $t7(%[src]) \n"  // |B7|A7|b7|a7|
    "sw $s3, 0(%[dst_a]) \n"
    "sw $s4, 0(%[dst_b]) \n"

    "precrq.ph.w $s1, $t8, $t0 \n"  // |B5|A5|B4|A4|
    "precrq.ph.w $s2, $s0, $t9 \n"  // |B7|A7|B6|A6|
    "precr.qb.ph $s3, $s2, $s1 \n"  // |A7|A6|A5|A4|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |B7|B6|B5|B4|

    "sll $t0, $t0, 16 \n"
    "packrl.ph $s1, $t8, $t0 \n"  // |b5|a5|b4|a4|
    "sll $t9, $t9, 16 \n"
    "packrl.ph $s2, $s0, $t9 \n"  // |b7|a7|b6|a6|
    "sw $s3, 4($s5) \n"
    "sw $s4, 4($s6) \n"

    "precr.qb.ph $s3, $s2, $s1 \n"  // |a7|a6|a5|a4|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |b7|b6|b5|b4|

    "addiu %[src], 4 \n"
    "addiu $t1, -1 \n"
    "sll $t0, %[dst_stride_a], 1 \n"
    "sll $t8, %[dst_stride_b], 1 \n"
    "sw $s3, 4(%[dst_a]) \n"
    "sw $s4, 4(%[dst_b]) \n"
    "addu %[dst_a], %[dst_a], $t0 \n"
    "bnez $t1, 1b \n"
    " addu %[dst_b], %[dst_b], $t8 \n"
    "b 2f \n"
    " nop \n"

    // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
    "11: \n"
    "lw $t0, 0(%[src]) \n"  // |B0|A0|b0|a0|
    "lwx $t8, %[src_stride](%[src]) \n"  // |B1|A1|b1|a1|
    "addu $s5, %[dst_a], %[dst_stride_a] \n"
    "lwx $t9, $t2(%[src]) \n"  // |B2|A2|b2|a2|
    "lwx $s0, $t3(%[src]) \n"  // |B3|A3|b3|a3|
    "addu $s6, %[dst_b], %[dst_stride_b] \n"

    "precrq.ph.w $s1, $t8, $t0 \n"  // |B1|A1|B0|A0|
    "precrq.ph.w $s2, $s0, $t9 \n"  // |B3|A3|B2|A2|
    "precr.qb.ph $s3, $s2, $s1 \n"  // |A3|A2|A1|A0|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |B3|B2|B1|B0|

    "sll $t0, $t0, 16 \n"
    "packrl.ph $s1, $t8, $t0 \n"  // |b1|a1|b0|a0|
    "sll $t9, $t9, 16 \n"
    "packrl.ph $s2, $s0, $t9 \n"  // |b3|a3|b2|a2|

    "swr $s3, 0($s5) \n"
    "swl $s3, 3($s5) \n"
    "swr $s4, 0($s6) \n"
    "swl $s4, 3($s6) \n"

    "precr.qb.ph $s3, $s2, $s1 \n"  // |a3|a2|a1|a0|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |b3|b2|b1|b0|

    "lwx $t0, $t4(%[src]) \n"  // |B4|A4|b4|a4|
    "lwx $t8, $t5(%[src]) \n"  // |B5|A5|b5|a5|
    "lwx $t9, $t6(%[src]) \n"  // |B6|A6|b6|a6|
    "lwx $s0, $t7(%[src]) \n"  // |B7|A7|b7|a7|
    "swr $s3, 0(%[dst_a]) \n"
    "swl $s3, 3(%[dst_a]) \n"
    "swr $s4, 0(%[dst_b]) \n"
    "swl $s4, 3(%[dst_b]) \n"

    "precrq.ph.w $s1, $t8, $t0 \n"  // |B5|A5|B4|A4|
    "precrq.ph.w $s2, $s0, $t9 \n"  // |B7|A7|B6|A6|
    "precr.qb.ph $s3, $s2, $s1 \n"  // |A7|A6|A5|A4|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |B7|B6|B5|B4|

    "sll $t0, $t0, 16 \n"
    "packrl.ph $s1, $t8, $t0 \n"  // |b5|a5|b4|a4|
    "sll $t9, $t9, 16 \n"
    "packrl.ph $s2, $s0, $t9 \n"  // |b7|a7|b6|a6|

    "swr $s3, 4($s5) \n"
    "swl $s3, 7($s5) \n"
    "swr $s4, 4($s6) \n"
    "swl $s4, 7($s6) \n"

    "precr.qb.ph $s3, $s2, $s1 \n"  // |a7|a6|a5|a4|
    "precrq.qb.ph $s4, $s2, $s1 \n"  // |b7|b6|b5|b4|

    "addiu %[src], 4 \n"
    "addiu $t1, -1 \n"
    "sll $t0, %[dst_stride_a], 1 \n"
    "sll $t8, %[dst_stride_b], 1 \n"
    "swr $s3, 4(%[dst_a]) \n"
    "swl $s3, 7(%[dst_a]) \n"
    "swr $s4, 4(%[dst_b]) \n"
    "swl $s4, 7(%[dst_b]) \n"
    "addu %[dst_a], %[dst_a], $t0 \n"
    "bnez $t1, 11b \n"
    " addu %[dst_b], %[dst_b], $t8 \n"

    "2: \n"
    ".set pop \n"
    : [src] "+r" (src),
      [dst_a] "+r" (dst_a),
      [dst_b] "+r" (dst_b),
      [width] "+r" (width),
      [src_stride] "+r" (src_stride)
    : [dst_stride_a] "r" (dst_stride_a),
      [dst_stride_b] "r" (dst_stride_b)
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3",
      "s4", "s5", "s6"
  );
}

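// Again for reference, a scalar sketch of the interleaved-UV transpose above,
// loosely modeled on libyuv's C fallback (TransposeUVWx8_C in
// rotate_common.cc); illustrative only, and the name TransposeUVWx8_Ref is
// ours. Even bytes of each interleaved source row go to dst_a and odd bytes
// to dst_b, transposed.
static void TransposeUVWx8_Ref(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 8; ++j) {
      dst_a[j] = src[j * src_stride + 0];  // plane A byte of source row j
      dst_b[j] = src[j * src_stride + 1];  // plane B byte of source row j
    }
    src += 2;               // next interleaved column pair
    dst_a += dst_stride_a;  // next row in each destination plane
    dst_b += dst_stride_b;
  }
}
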
#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif