Home | History | Annotate | Download | only in examples
      1 // Copyright 2015, ARM Limited
      2 // All rights reserved.
      3 //
      4 // Redistribution and use in source and binary forms, with or without
      5 // modification, are permitted provided that the following conditions are met:
      6 //
      7 //   * Redistributions of source code must retain the above copyright notice,
      8 //     this list of conditions and the following disclaimer.
      9 //   * Redistributions in binary form must reproduce the above copyright notice,
     10 //     this list of conditions and the following disclaimer in the documentation
     11 //     and/or other materials provided with the distribution.
     12 //   * Neither the name of ARM Limited nor the names of its contributors may be
     13 //     used to endorse or promote products derived from this software without
     14 //     specific prior written permission.
     15 //
     16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
     17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
     20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
     23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
     24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 
     27 #include "examples.h"
     28 
     29 #define BUF_SIZE (4096)
     30 #define __ masm->
     31 
     32 // A vector by scalar multiply helper routine to generate code for
     33 // the multiplication of each column of the resulting 4x4 matrix.
     34 // This function provides a template for the following pattern:
     35 //
     36 // __ Fmul(v<v_out>.V4S(), v4.V4S(),  v<s_column>.S(), 0);
     37 // __ Fmla(v<v_out>.V4S(), v5.V4S(),  v<s_column>.S(), 1);
     38 // __ Fmla(v<v_out>.V4S(), v6.V4S(),  v<s_column>.S(), 2);
     39 // __ Fmla(v<v_out>.V4S(), v7.V4S(),  v<s_column>.S(), 3);
     40 //
     41 // v<v_out> corresponds to a column of the output matrix (v0, v1, v2 or v3).
     42 // v<s_column> corresponds to a column of the 2nd input (v16, v17, v18 or v19).
     43 //
     44 static void GenerateMultiplyColumn(MacroAssembler* masm,
     45                                    unsigned out_column,
     46                                    unsigned in_column) {
     47   // 'v_out' splits a Q register into 4 lanes of 32 bits each.
     48   VRegister v_out = VRegister(out_column, kQRegSize, 4);
     49   // 'v_in' refers to a single 32 bit 'S' lane.
     50   VRegister v_in = VRegister(in_column, kSRegSize);
     51 
     52   __ Fmul(v_out, v4.V4S(), v_in, 0);  // e.g. (v0.V4S(), v4.V4S(),  v8.S(), 0).
     53   __ Fmla(v_out, v5.V4S(), v_in, 1);
     54   __ Fmla(v_out, v6.V4S(), v_in, 2);
     55   __ Fmla(v_out, v7.V4S(), v_in, 3);
     56 }
     57 
     58 void GenerateNEONMatrixMultiply(MacroAssembler* masm) {
     59   // Argument location:
     60   //   dst  -> x0
     61   //   mat1 -> x1
     62   //   mat2 -> x2
     63 
     64   Label end;
     65 
     66   __ And(x3, x0, x1);
     67   __ And(x3, x3, x2);
     68   __ Cbz(x3, &end);  // Nothing to do if an input is null.
     69 
     70   // Load the first matrix into v4, v5, v6 and v7.
     71   __ Ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1));
     72   // Load the first matrix into v16, v17, v18 and v19.
     73   __ Ld1(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x2));
     74 
     75   // Initialise vectors of the output matrix with zeros.
     76   // This is only for the purposes of showing how this can be achived
     77   // but technically this is not required because we overwrite all lanes
     78   // of the output vectors.
     79   __ Movi(v0.V16B(), 0);
     80   __ Movi(v1.V16B(), 0);
     81   __ Movi(v2.V16B(), 0);
     82   __ Movi(v3.V16B(), 0);
     83 
     84   GenerateMultiplyColumn(masm, 0, 16);
     85   GenerateMultiplyColumn(masm, 1, 17);
     86   GenerateMultiplyColumn(masm, 2, 18);
     87   GenerateMultiplyColumn(masm, 3, 19);
     88 
     89   // Store the resulting matrix from v0, v1, v2 and v3.
     90   __ St1(v0.V4S(), v1.V4S(), v2.V4S(), v3.V4S(), MemOperand(x0));
     91 
     92   __ Bind(&end);
     93   __ Ret();
     94 }
     95 
     96 
     97 #ifndef TEST_EXAMPLES
     98 #ifdef VIXL_INCLUDE_SIMULATOR
     99 int main(void) {
    100   // Create and initialize the assembler and the simulator.
    101   byte assm_buf[BUF_SIZE];
    102   MacroAssembler masm(assm_buf, BUF_SIZE);
    103   Decoder decoder;
    104   Simulator simulator(&decoder);
    105 
    106   // Generate the code for the example function.
    107   Label neon_matrix_multiply;
    108   masm.Bind(&neon_matrix_multiply);
    109   GenerateNEONMatrixMultiply(&masm);
    110   masm.FinalizeCode();
    111 
    112   // Define the required variables and run the example function.
    113   const int kRowSize = 4;
    114   const int kColSize = 4;
    115   const int kLength = kRowSize * kColSize;
    116 
    117   float mat1[kLength], mat2[kLength], output[kLength];
    118 
    119   // Initialise the output matrix to the zero matrix.
    120   memset(output, 0, sizeof(output[0]) * kLength);
    121 
    122   // Fill the two input matrices with some 32 bit floating point values.
    123   // Array initialisation using curly brackets is also possible like so:
    124   //   float mat1[kLength] = { 1.0f, 52.03f, 4.43f, ... };
    125   // However, the following way better shows the "column-major" arrangement.
    126 
    127   mat1[0] =   1.0f; mat1[4] =   2.0f; mat1[ 8] =   3.0f; mat1[12] =   4.0f;
    128   mat1[1] = 52.03f; mat1[5] = 12.24f; mat1[ 9] = 53.56f; mat1[13] = 22.22f;
    129   mat1[2] =  4.43f; mat1[6] =  5.00f; mat1[10] =  7.00f; mat1[14] =  3.11f;
    130   mat1[3] = 43.47f; mat1[7] = 10.97f; mat1[11] = 37.78f; mat1[15] = 90.91f;
    131 
    132   mat2[0] =   1.0f; mat2[4] = 11.24f; mat2[ 8] = 21.00f; mat2[12] = 21.31f;
    133   mat2[1] =   2.0f; mat2[5] =  2.24f; mat2[ 9] =  8.56f; mat2[13] = 52.03f;
    134   mat2[2] =   3.0f; mat2[6] = 51.00f; mat2[10] = 21.00f; mat2[14] = 33.11f;
    135   mat2[3] =   4.0f; mat2[7] =  0.00f; mat2[11] = 84.00f; mat2[15] =  1.97f;
    136 
    137   simulator.ResetState();
    138   simulator.set_xreg(0, reinterpret_cast<uintptr_t>(output));
    139   simulator.set_xreg(1, reinterpret_cast<uintptr_t>(mat1));
    140   simulator.set_xreg(2, reinterpret_cast<uintptr_t>(mat2));
    141   simulator.RunFrom(masm.GetLabelAddress<Instruction*>(&neon_matrix_multiply));
    142 
    143   // Print the 4x4 output matrix along with both 4x4 input matrices.
    144   for (int i = 0; i < kRowSize; i++) {
    145     printf("| %8.2f %8.2f %8.2f %8.2f |   "
    146            "| %8.2f %8.2f %8.2f %8.2f |       "
    147            "| %8.2f %8.2f %8.2f %8.2f |\n",
    148              mat1[i],   mat1[4+i],   mat1[8+i],   mat1[12+i],
    149              mat2[i],   mat2[4+i],   mat2[8+i],   mat2[12+i],
    150            output[i], output[4+i], output[8+i], output[12+i]);
    151     if (i == 0 || i == 2) {
    152       printf("|                                     |   "
    153              "|                                     |       "
    154              "|                                     |\n");
    155     } else if (i == 1) {
    156       printf("|                                     | x "
    157              "|                                     |   =   "
    158              "|                                     |\n");
    159     }
    160   }
    161 
    162   return 0;
    163 }
    164 #else
    165 // Without the simulator there is nothing to test.
    166 int main(void) { return 0; }
    167 #endif  // VIXL_INCLUDE_SIMULATOR
    168 #endif  // TEST_EXAMPLES
    169