// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud (at) inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
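/* Reference semantics (an illustrative sketch only, not part of Eigen's API):
 * ignoring alignment and packetization, the col-major kernel below is
 * equivalent to the following scalar loops, where cj() applies the optional
 * ConjugateLhs/ConjugateRhs conjugation to its operand:
 *
 *   for (Index i = 0; i < cols; ++i)        // one column of lhs at a time
 *     for (Index j = 0; j < rows; ++j)
 *       res[j] += alpha * cj(lhs[j + i*lhsStride]) * cj(rhs[i*rhsIncr]);
 *
 * The optimized implementation traverses the data in the same order but
 * processes 4 columns per iteration with packet loads/stores.
 */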
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index
  #ifdef EIGEN_INTERNAL_DEBUGGING
    resIncr
  #endif
  , RhsScalar alpha)
{
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find out how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }
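    // Worked example (illustrative numbers only, assuming 4-scalar packets such
    // as float with 128-bit SIMD): with lhsStride==7, alignmentStep==(4-7%4)&3==1,
    // so the first-aligned offset of consecutive columns advances by 1 modulo 4
    // (FirstAligned) and at most 3 columns have to be skipped before a column
    // lines up with res. lhsStride==6 gives alignmentStep==2, i.e. EvenAligned
    // (every second column shares the same alignment), and lhsStride==8 gives
    // alignmentStep==0, i.e. AllAligned.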

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization
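    // Each _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) expansion above updates one
    // packet of ResPacketSize result coefficients, i.e. roughly the vector form of
    //   res[j] += lhs0[j]*ptmp0 + lhs1[j]*ptmp1 + lhs2[j]*ptmp2 + lhs3[j]*ptmp3;
    // (with the pcj conjugations applied, and each ptmpK already holding
    // alpha*rhs of the corresponding column). The d/du suffixes expand to
    // pload (aligned) or ploadu (unaligned) for the corresponding lhs column.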

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces the number of
 * loads/stores of the result by a factor of 4 and reduces instruction
 * dependencies. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
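/* Reference semantics (an illustrative sketch only, not part of Eigen's API):
 * ignoring alignment and packetization, the row-major kernel below is
 * equivalent to the following scalar loops, where cj() applies the optional
 * ConjugateLhs/ConjugateRhs conjugation to its operand:
 *
 *   for (Index i = 0; i < rows; ++i) {       // one dot product per row
 *     ResScalar tmp(0);
 *     for (Index j = 0; j < cols; ++j)
 *       tmp += cj(lhs[j + i*lhsStride]) * cj(rhs[j]);
 *     res[i*resIncr] += alpha * tmp;
 *   }
 *
 * The optimized implementation accumulates 4 such dot products per iteration.
 */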
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find out how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generating good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, the unaligned loads
               * are handled using special shift/move operations between the two
               * aligned packets overlapping the desired unaligned packet. This
               * is *much* more efficient than plain unaligned loads.
               */
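              // Roughly: palign<K>(a,b) overwrites a with the packet that starts
              // K scalars into the concatenation [a,b]. For example, with
              // 4-scalar packets, a=={x0,x1,x2,x3} and b=={x4,x5,x6,x7},
              // palign<1>(a,b) leaves a=={x1,x2,x3,x4}. Two aligned loads plus
              // one palign thus stand in for one unaligned load of
              // lhs1/lhs2/lhs3 below.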
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned rhs coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}
};

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H