1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <stdio.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom_dsp/mips/convolve_common_dspr2.h" 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/aom_filter.h" 20 #include "aom_ports/mem.h" 21 22 #if HAVE_DSPR2 23 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, 24 uint8_t *dst, ptrdiff_t dst_stride, 25 const int16_t *filter_x, int filter_x_stride, 26 const int16_t *filter_y, int filter_y_stride, 27 int w, int h) { 28 int x, y; 29 30 (void)filter_x; 31 (void)filter_x_stride; 32 (void)filter_y; 33 (void)filter_y_stride; 34 35 /* prefetch data to cache memory */ 36 prefetch_load(src); 37 prefetch_load(src + 32); 38 prefetch_store(dst); 39 40 switch (w) { 41 case 4: { 42 uint32_t tp1; 43 44 /* 1 word storage */ 45 for (y = h; y--;) { 46 prefetch_load(src + src_stride); 47 prefetch_load(src + src_stride + 32); 48 prefetch_store(dst + dst_stride); 49 50 __asm__ __volatile__( 51 "ulw %[tp1], (%[src]) \n\t" 52 "sw %[tp1], (%[dst]) \n\t" /* store */ 53 54 : [tp1] "=&r"(tp1) 55 : [src] "r"(src), [dst] "r"(dst)); 56 57 src += src_stride; 58 dst += dst_stride; 59 } 60 } break; 61 case 8: { 62 uint32_t tp1, tp2; 63 64 /* 2 word storage */ 65 for (y = h; y--;) { 66 prefetch_load(src + src_stride); 67 prefetch_load(src + src_stride + 32); 68 prefetch_store(dst + dst_stride); 69 70 __asm__ __volatile__( 71 "ulw %[tp1], 0(%[src]) \n\t" 72 "ulw %[tp2], 4(%[src]) \n\t" 73 "sw %[tp1], 0(%[dst]) \n\t" /* store */ 74 "sw %[tp2], 4(%[dst]) \n\t" /* store */ 75 76 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) 77 : [src] "r"(src), [dst] "r"(dst)); 78 79 src += src_stride; 80 dst += dst_stride; 81 } 82 } break; 83 case 16: { 84 uint32_t tp1, tp2, tp3, tp4; 85 86 /* 4 word storage */ 87 for (y = h; y--;) { 88 prefetch_load(src + src_stride); 89 prefetch_load(src + src_stride + 32); 90 prefetch_store(dst + dst_stride); 91 92 __asm__ __volatile__( 93 "ulw %[tp1], 0(%[src]) \n\t" 94 "ulw %[tp2], 4(%[src]) \n\t" 95 "ulw %[tp3], 8(%[src]) \n\t" 96 "ulw %[tp4], 12(%[src]) \n\t" 97 98 "sw %[tp1], 0(%[dst]) \n\t" /* store */ 99 "sw %[tp2], 4(%[dst]) \n\t" /* store */ 100 "sw %[tp3], 8(%[dst]) \n\t" /* store */ 101 "sw %[tp4], 12(%[dst]) \n\t" /* store */ 102 103 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), 104 [tp4] "=&r"(tp4) 105 : [src] "r"(src), [dst] "r"(dst)); 106 107 src += src_stride; 108 dst += dst_stride; 109 } 110 } break; 111 case 32: { 112 uint32_t tp1, tp2, tp3, tp4; 113 uint32_t tp5, tp6, tp7, tp8; 114 115 /* 8 word storage */ 116 for (y = h; y--;) { 117 prefetch_load(src + src_stride); 118 prefetch_load(src + src_stride + 32); 119 prefetch_store(dst + dst_stride); 120 121 __asm__ __volatile__( 122 "ulw %[tp1], 0(%[src]) \n\t" 123 "ulw %[tp2], 4(%[src]) \n\t" 124 "ulw %[tp3], 8(%[src]) \n\t" 125 "ulw %[tp4], 12(%[src]) \n\t" 126 "ulw %[tp5], 16(%[src]) \n\t" 127 "ulw %[tp6], 20(%[src]) \n\t" 128 "ulw %[tp7], 24(%[src]) \n\t" 129 "ulw %[tp8], 28(%[src]) \n\t" 130 131 "sw %[tp1], 0(%[dst]) \n\t" /* store */ 132 "sw %[tp2], 4(%[dst]) \n\t" /* store */ 133 "sw %[tp3], 8(%[dst]) \n\t" /* store */ 134 "sw %[tp4], 12(%[dst]) \n\t" /* store */ 135 "sw %[tp5], 16(%[dst]) \n\t" /* store */ 136 "sw %[tp6], 20(%[dst]) \n\t" /* store */ 137 "sw %[tp7], 24(%[dst]) \n\t" /* store */ 138 "sw %[tp8], 28(%[dst]) \n\t" /* store */ 139 140 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), 141 [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), 142 [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) 143 : [src] "r"(src), [dst] "r"(dst)); 144 145 src += src_stride; 146 dst += dst_stride; 147 } 148 } break; 149 case 64: { 150 uint32_t tp1, tp2, tp3, tp4; 151 uint32_t tp5, tp6, tp7, tp8; 152 153 prefetch_load(src + 64); 154 prefetch_store(dst + 32); 155 156 /* 16 word storage */ 157 for (y = h; y--;) { 158 prefetch_load(src + src_stride); 159 prefetch_load(src + src_stride + 32); 160 prefetch_load(src + src_stride + 64); 161 prefetch_store(dst + dst_stride); 162 prefetch_store(dst + dst_stride + 32); 163 164 __asm__ __volatile__( 165 "ulw %[tp1], 0(%[src]) \n\t" 166 "ulw %[tp2], 4(%[src]) \n\t" 167 "ulw %[tp3], 8(%[src]) \n\t" 168 "ulw %[tp4], 12(%[src]) \n\t" 169 "ulw %[tp5], 16(%[src]) \n\t" 170 "ulw %[tp6], 20(%[src]) \n\t" 171 "ulw %[tp7], 24(%[src]) \n\t" 172 "ulw %[tp8], 28(%[src]) \n\t" 173 174 "sw %[tp1], 0(%[dst]) \n\t" /* store */ 175 "sw %[tp2], 4(%[dst]) \n\t" /* store */ 176 "sw %[tp3], 8(%[dst]) \n\t" /* store */ 177 "sw %[tp4], 12(%[dst]) \n\t" /* store */ 178 "sw %[tp5], 16(%[dst]) \n\t" /* store */ 179 "sw %[tp6], 20(%[dst]) \n\t" /* store */ 180 "sw %[tp7], 24(%[dst]) \n\t" /* store */ 181 "sw %[tp8], 28(%[dst]) \n\t" /* store */ 182 183 "ulw %[tp1], 32(%[src]) \n\t" 184 "ulw %[tp2], 36(%[src]) \n\t" 185 "ulw %[tp3], 40(%[src]) \n\t" 186 "ulw %[tp4], 44(%[src]) \n\t" 187 "ulw %[tp5], 48(%[src]) \n\t" 188 "ulw %[tp6], 52(%[src]) \n\t" 189 "ulw %[tp7], 56(%[src]) \n\t" 190 "ulw %[tp8], 60(%[src]) \n\t" 191 192 "sw %[tp1], 32(%[dst]) \n\t" /* store */ 193 "sw %[tp2], 36(%[dst]) \n\t" /* store */ 194 "sw %[tp3], 40(%[dst]) \n\t" /* store */ 195 "sw %[tp4], 44(%[dst]) \n\t" /* store */ 196 "sw %[tp5], 48(%[dst]) \n\t" /* store */ 197 "sw %[tp6], 52(%[dst]) \n\t" /* store */ 198 "sw %[tp7], 56(%[dst]) \n\t" /* store */ 199 "sw %[tp8], 60(%[dst]) \n\t" /* store */ 200 201 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), 202 [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), 203 [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) 204 : [src] "r"(src), [dst] "r"(dst)); 205 206 src += src_stride; 207 dst += dst_stride; 208 } 209 } break; 210 default: 211 for (y = h; y--;) { 212 for (x = 0; x < w; ++x) { 213 dst[x] = src[x]; 214 } 215 216 src += src_stride; 217 dst += dst_stride; 218 } 219 break; 220 } 221 } 222 #endif 223