1 #include <arm_neon.h> 2 3 struct Matrix43 { 4 float32x4_t row0; 5 float32x4_t row1; 6 float32x4_t row2; 7 float32x4_t row3; 8 }; 9 10 __attribute__((always_inline)) inline Matrix43 operator*(const Matrix43& m1, const Matrix43& m2) { 11 Matrix43 rr; 12 rr.row0 = vmulq_n_f32( m2.row0, vgetq_lane_f32(m1.row0, 0)); 13 rr.row0 = vmlaq_n_f32(rr.row0, m2.row1, vgetq_lane_f32(m1.row0, 1)); 14 rr.row0 = vmlaq_n_f32(rr.row0, m2.row2, vgetq_lane_f32(m1.row0, 2)); 15 16 rr.row1 = vmulq_n_f32( m2.row0, vgetq_lane_f32(m1.row1, 0)); 17 rr.row1 = vmlaq_n_f32(rr.row1, m2.row1, vgetq_lane_f32(m1.row1, 1)); 18 rr.row1 = vmlaq_n_f32(rr.row1, m2.row2, vgetq_lane_f32(m1.row1, 2)); 19 20 rr.row2 = vmulq_n_f32( m2.row0, vgetq_lane_f32(m1.row2, 0)); 21 rr.row2 = vmlaq_n_f32(rr.row2, m2.row1, vgetq_lane_f32(m1.row2, 1)); 22 rr.row2 = vmlaq_n_f32(rr.row2, m2.row2, vgetq_lane_f32(m1.row2, 2)); 23 24 rr.row3 = vmlaq_n_f32(m2.row3, m2.row0, vgetq_lane_f32(m1.row3, 0)); 25 rr.row3 = vmlaq_n_f32(rr.row3, m2.row1, vgetq_lane_f32(m1.row3, 1)); 26 rr.row3 = vmlaq_n_f32(rr.row3, m2.row2, vgetq_lane_f32(m1.row3, 2)); 27 return rr; 28 } 29 30 void _f_with_internal_compiler_error(const Matrix43& m, const void* a1, const void* a2) { 31 m * m * m; 32 } 33