1 // Copyright 2015, ARM Limited 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions are met: 6 // 7 // * Redistributions of source code must retain the above copyright notice, 8 // this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above copyright notice, 10 // this list of conditions and the following disclaimer in the documentation 11 // and/or other materials provided with the distribution. 12 // * Neither the name of ARM Limited nor the names of its contributors may be 13 // used to endorse or promote products derived from this software without 14 // specific prior written permission. 15 // 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND 17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 27 #include "examples.h" 28 29 #define BUF_SIZE (4096) 30 #define __ masm-> 31 32 // A vector by scalar multiply helper routine to generate code for 33 // the multiplication of each column of the resulting 4x4 matrix. 34 // This function provides a template for the following pattern: 35 // 36 // __ Fmul(v<v_out>.V4S(), v4.V4S(), v<s_column>.S(), 0); 37 // __ Fmla(v<v_out>.V4S(), v5.V4S(), v<s_column>.S(), 1); 38 // __ Fmla(v<v_out>.V4S(), v6.V4S(), v<s_column>.S(), 2); 39 // __ Fmla(v<v_out>.V4S(), v7.V4S(), v<s_column>.S(), 3); 40 // 41 // v<v_out> corresponds to a column of the output matrix (v0, v1, v2 or v3). 42 // v<s_column> corresponds to a column of the 2nd input (v16, v17, v18 or v19). 43 // 44 static void GenerateMultiplyColumn(MacroAssembler* masm, 45 unsigned out_column, 46 unsigned in_column) { 47 // 'v_out' splits a Q register into 4 lanes of 32 bits each. 48 VRegister v_out = VRegister(out_column, kQRegSize, 4); 49 // 'v_in' refers to a single 32 bit 'S' lane. 50 VRegister v_in = VRegister(in_column, kSRegSize); 51 52 __ Fmul(v_out, v4.V4S(), v_in, 0); // e.g. (v0.V4S(), v4.V4S(), v8.S(), 0). 53 __ Fmla(v_out, v5.V4S(), v_in, 1); 54 __ Fmla(v_out, v6.V4S(), v_in, 2); 55 __ Fmla(v_out, v7.V4S(), v_in, 3); 56 } 57 58 void GenerateNEONMatrixMultiply(MacroAssembler* masm) { 59 // Argument location: 60 // dst -> x0 61 // mat1 -> x1 62 // mat2 -> x2 63 64 Label end; 65 66 __ And(x3, x0, x1); 67 __ And(x3, x3, x2); 68 __ Cbz(x3, &end); // Nothing to do if an input is null. 69 70 // Load the first matrix into v4, v5, v6 and v7. 71 __ Ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1)); 72 // Load the first matrix into v16, v17, v18 and v19. 73 __ Ld1(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x2)); 74 75 // Initialise vectors of the output matrix with zeros. 76 // This is only for the purposes of showing how this can be achived 77 // but technically this is not required because we overwrite all lanes 78 // of the output vectors. 79 __ Movi(v0.V16B(), 0); 80 __ Movi(v1.V16B(), 0); 81 __ Movi(v2.V16B(), 0); 82 __ Movi(v3.V16B(), 0); 83 84 GenerateMultiplyColumn(masm, 0, 16); 85 GenerateMultiplyColumn(masm, 1, 17); 86 GenerateMultiplyColumn(masm, 2, 18); 87 GenerateMultiplyColumn(masm, 3, 19); 88 89 // Store the resulting matrix from v0, v1, v2 and v3. 90 __ St1(v0.V4S(), v1.V4S(), v2.V4S(), v3.V4S(), MemOperand(x0)); 91 92 __ Bind(&end); 93 __ Ret(); 94 } 95 96 97 #ifndef TEST_EXAMPLES 98 #ifdef USE_SIMULATOR 99 int main(void) { 100 // Create and initialize the assembler and the simulator. 101 byte assm_buf[BUF_SIZE]; 102 MacroAssembler masm(assm_buf, BUF_SIZE); 103 Decoder decoder; 104 Simulator simulator(&decoder); 105 106 // Generate the code for the example function. 107 Label neon_matrix_multiply; 108 masm.Bind(&neon_matrix_multiply); 109 GenerateNEONMatrixMultiply(&masm); 110 masm.FinalizeCode(); 111 112 // Define the required variables and run the example function. 113 const int kRowSize = 4; 114 const int kColSize = 4; 115 const int kLength = kRowSize * kColSize; 116 117 float mat1[kLength], mat2[kLength], output[kLength]; 118 119 // Initialise the output matrix to the zero matrix. 120 memset(output, 0, sizeof(output[0]) * kLength); 121 122 // Fill the two input matrices with some 32 bit floating point values. 123 // Array initialisation using curly brackets is also possible like so: 124 // float mat1[kLength] = { 1.0f, 52.03f, 4.43f, ... }; 125 // However, the following way better shows the "column-major" arrangement. 126 127 mat1[0] = 1.0f; mat1[4] = 2.0f; mat1[ 8] = 3.0f; mat1[12] = 4.0f; 128 mat1[1] = 52.03f; mat1[5] = 12.24f; mat1[ 9] = 53.56f; mat1[13] = 22.22f; 129 mat1[2] = 4.43f; mat1[6] = 5.00f; mat1[10] = 7.00f; mat1[14] = 3.11f; 130 mat1[3] = 43.47f; mat1[7] = 10.97f; mat1[11] = 37.78f; mat1[15] = 90.91f; 131 132 mat2[0] = 1.0f; mat2[4] = 11.24f; mat2[ 8] = 21.00f; mat2[12] = 21.31f; 133 mat2[1] = 2.0f; mat2[5] = 2.24f; mat2[ 9] = 8.56f; mat2[13] = 52.03f; 134 mat2[2] = 3.0f; mat2[6] = 51.00f; mat2[10] = 21.00f; mat2[14] = 33.11f; 135 mat2[3] = 4.0f; mat2[7] = 0.00f; mat2[11] = 84.00f; mat2[15] = 1.97f; 136 137 simulator.ResetState(); 138 simulator.set_xreg(0, reinterpret_cast<uintptr_t>(output)); 139 simulator.set_xreg(1, reinterpret_cast<uintptr_t>(mat1)); 140 simulator.set_xreg(2, reinterpret_cast<uintptr_t>(mat2)); 141 simulator.RunFrom(masm.GetLabelAddress<Instruction*>(&neon_matrix_multiply)); 142 143 // Print the 4x4 output matrix along with both 4x4 input matrices. 144 for (int i = 0; i < kRowSize; i++) { 145 printf("| %8.2f %8.2f %8.2f %8.2f | " 146 "| %8.2f %8.2f %8.2f %8.2f | " 147 "| %8.2f %8.2f %8.2f %8.2f |\n", 148 mat1[i], mat1[4+i], mat1[8+i], mat1[12+i], 149 mat2[i], mat2[4+i], mat2[8+i], mat2[12+i], 150 output[i], output[4+i], output[8+i], output[12+i]); 151 if (i == 0 || i == 2) { 152 printf("| | " 153 "| | " 154 "| |\n"); 155 } else if (i == 1) { 156 printf("| | x " 157 "| | = " 158 "| |\n"); 159 } 160 } 161 162 return 0; 163 } 164 #else 165 // Without the simulator there is nothing to test. 166 int main(void) { return 0; } 167 #endif // USE_SIMULATOR 168 #endif // TEST_EXAMPLES 169