Home | History | Annotate | Download | only in mips
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <stdio.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom_dsp/mips/convolve_common_dspr2.h"
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/aom_filter.h"
     20 #include "aom_ports/mem.h"
     21 
     22 #if HAVE_DSPR2
     23 void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
     24                              uint8_t *dst, ptrdiff_t dst_stride,
     25                              const int16_t *filter_x, int filter_x_stride,
     26                              const int16_t *filter_y, int filter_y_stride,
     27                              int w, int h) {
     28   int x, y;
     29 
     30   (void)filter_x;
     31   (void)filter_x_stride;
     32   (void)filter_y;
     33   (void)filter_y_stride;
     34 
     35   /* prefetch data to cache memory */
     36   prefetch_load(src);
     37   prefetch_load(src + 32);
     38   prefetch_store(dst);
     39 
     40   switch (w) {
     41     case 4: {
     42       uint32_t tp1;
     43 
     44       /* 1 word storage */
     45       for (y = h; y--;) {
     46         prefetch_load(src + src_stride);
     47         prefetch_load(src + src_stride + 32);
     48         prefetch_store(dst + dst_stride);
     49 
     50         __asm__ __volatile__(
     51             "ulw              %[tp1],         (%[src])      \n\t"
     52             "sw               %[tp1],         (%[dst])      \n\t" /* store */
     53 
     54             : [tp1] "=&r"(tp1)
     55             : [src] "r"(src), [dst] "r"(dst));
     56 
     57         src += src_stride;
     58         dst += dst_stride;
     59       }
     60     } break;
     61     case 8: {
     62       uint32_t tp1, tp2;
     63 
     64       /* 2 word storage */
     65       for (y = h; y--;) {
     66         prefetch_load(src + src_stride);
     67         prefetch_load(src + src_stride + 32);
     68         prefetch_store(dst + dst_stride);
     69 
     70         __asm__ __volatile__(
     71             "ulw              %[tp1],         0(%[src])      \n\t"
     72             "ulw              %[tp2],         4(%[src])      \n\t"
     73             "sw               %[tp1],         0(%[dst])      \n\t" /* store */
     74             "sw               %[tp2],         4(%[dst])      \n\t" /* store */
     75 
     76             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
     77             : [src] "r"(src), [dst] "r"(dst));
     78 
     79         src += src_stride;
     80         dst += dst_stride;
     81       }
     82     } break;
     83     case 16: {
     84       uint32_t tp1, tp2, tp3, tp4;
     85 
     86       /* 4 word storage */
     87       for (y = h; y--;) {
     88         prefetch_load(src + src_stride);
     89         prefetch_load(src + src_stride + 32);
     90         prefetch_store(dst + dst_stride);
     91 
     92         __asm__ __volatile__(
     93             "ulw              %[tp1],         0(%[src])      \n\t"
     94             "ulw              %[tp2],         4(%[src])      \n\t"
     95             "ulw              %[tp3],         8(%[src])      \n\t"
     96             "ulw              %[tp4],         12(%[src])     \n\t"
     97 
     98             "sw               %[tp1],         0(%[dst])      \n\t" /* store */
     99             "sw               %[tp2],         4(%[dst])      \n\t" /* store */
    100             "sw               %[tp3],         8(%[dst])      \n\t" /* store */
    101             "sw               %[tp4],         12(%[dst])     \n\t" /* store */
    102 
    103             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
    104               [tp4] "=&r"(tp4)
    105             : [src] "r"(src), [dst] "r"(dst));
    106 
    107         src += src_stride;
    108         dst += dst_stride;
    109       }
    110     } break;
    111     case 32: {
    112       uint32_t tp1, tp2, tp3, tp4;
    113       uint32_t tp5, tp6, tp7, tp8;
    114 
    115       /* 8 word storage */
    116       for (y = h; y--;) {
    117         prefetch_load(src + src_stride);
    118         prefetch_load(src + src_stride + 32);
    119         prefetch_store(dst + dst_stride);
    120 
    121         __asm__ __volatile__(
    122             "ulw              %[tp1],         0(%[src])      \n\t"
    123             "ulw              %[tp2],         4(%[src])      \n\t"
    124             "ulw              %[tp3],         8(%[src])      \n\t"
    125             "ulw              %[tp4],         12(%[src])     \n\t"
    126             "ulw              %[tp5],         16(%[src])     \n\t"
    127             "ulw              %[tp6],         20(%[src])     \n\t"
    128             "ulw              %[tp7],         24(%[src])     \n\t"
    129             "ulw              %[tp8],         28(%[src])     \n\t"
    130 
    131             "sw               %[tp1],         0(%[dst])      \n\t" /* store */
    132             "sw               %[tp2],         4(%[dst])      \n\t" /* store */
    133             "sw               %[tp3],         8(%[dst])      \n\t" /* store */
    134             "sw               %[tp4],         12(%[dst])     \n\t" /* store */
    135             "sw               %[tp5],         16(%[dst])     \n\t" /* store */
    136             "sw               %[tp6],         20(%[dst])     \n\t" /* store */
    137             "sw               %[tp7],         24(%[dst])     \n\t" /* store */
    138             "sw               %[tp8],         28(%[dst])     \n\t" /* store */
    139 
    140             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
    141               [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
    142               [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
    143             : [src] "r"(src), [dst] "r"(dst));
    144 
    145         src += src_stride;
    146         dst += dst_stride;
    147       }
    148     } break;
    149     case 64: {
    150       uint32_t tp1, tp2, tp3, tp4;
    151       uint32_t tp5, tp6, tp7, tp8;
    152 
    153       prefetch_load(src + 64);
    154       prefetch_store(dst + 32);
    155 
    156       /* 16 word storage */
    157       for (y = h; y--;) {
    158         prefetch_load(src + src_stride);
    159         prefetch_load(src + src_stride + 32);
    160         prefetch_load(src + src_stride + 64);
    161         prefetch_store(dst + dst_stride);
    162         prefetch_store(dst + dst_stride + 32);
    163 
    164         __asm__ __volatile__(
    165             "ulw              %[tp1],         0(%[src])      \n\t"
    166             "ulw              %[tp2],         4(%[src])      \n\t"
    167             "ulw              %[tp3],         8(%[src])      \n\t"
    168             "ulw              %[tp4],         12(%[src])     \n\t"
    169             "ulw              %[tp5],         16(%[src])     \n\t"
    170             "ulw              %[tp6],         20(%[src])     \n\t"
    171             "ulw              %[tp7],         24(%[src])     \n\t"
    172             "ulw              %[tp8],         28(%[src])     \n\t"
    173 
    174             "sw               %[tp1],         0(%[dst])      \n\t" /* store */
    175             "sw               %[tp2],         4(%[dst])      \n\t" /* store */
    176             "sw               %[tp3],         8(%[dst])      \n\t" /* store */
    177             "sw               %[tp4],         12(%[dst])     \n\t" /* store */
    178             "sw               %[tp5],         16(%[dst])     \n\t" /* store */
    179             "sw               %[tp6],         20(%[dst])     \n\t" /* store */
    180             "sw               %[tp7],         24(%[dst])     \n\t" /* store */
    181             "sw               %[tp8],         28(%[dst])     \n\t" /* store */
    182 
    183             "ulw              %[tp1],         32(%[src])     \n\t"
    184             "ulw              %[tp2],         36(%[src])     \n\t"
    185             "ulw              %[tp3],         40(%[src])     \n\t"
    186             "ulw              %[tp4],         44(%[src])     \n\t"
    187             "ulw              %[tp5],         48(%[src])     \n\t"
    188             "ulw              %[tp6],         52(%[src])     \n\t"
    189             "ulw              %[tp7],         56(%[src])     \n\t"
    190             "ulw              %[tp8],         60(%[src])     \n\t"
    191 
    192             "sw               %[tp1],         32(%[dst])     \n\t" /* store */
    193             "sw               %[tp2],         36(%[dst])     \n\t" /* store */
    194             "sw               %[tp3],         40(%[dst])     \n\t" /* store */
    195             "sw               %[tp4],         44(%[dst])     \n\t" /* store */
    196             "sw               %[tp5],         48(%[dst])     \n\t" /* store */
    197             "sw               %[tp6],         52(%[dst])     \n\t" /* store */
    198             "sw               %[tp7],         56(%[dst])     \n\t" /* store */
    199             "sw               %[tp8],         60(%[dst])     \n\t" /* store */
    200 
    201             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
    202               [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
    203               [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
    204             : [src] "r"(src), [dst] "r"(dst));
    205 
    206         src += src_stride;
    207         dst += dst_stride;
    208       }
    209     } break;
    210     default:
    211       for (y = h; y--;) {
    212         for (x = 0; x < w; ++x) {
    213           dst[x] = src[x];
    214         }
    215 
    216         src += src_stride;
    217         dst += dst_stride;
    218       }
    219       break;
    220   }
    221 }
    222 #endif
    223