// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud (at) inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

namespace Eigen {

namespace internal {

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;


/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
{
  static std::ptrdiff_t m_l1CacheSize = 0;
  static std::ptrdiff_t m_l2CacheSize = 0;
  if(m_l2CacheSize==0)
  {
    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
  }

  if(action==SetAction)
  {
    // set the cpu cache size and cache all block sizes from a global cache size in bytes
    eigen_internal_assert(l1!=0 && l2!=0);
    m_l1CacheSize = *l1;
    m_l2CacheSize = *l2;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_l1CacheSize;
    *l2 = m_l2CacheSize;
  }
  else
  {
    eigen_internal_assert(false);
  }
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product
  *
  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
  *
  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
  * this function computes the blocking size parameters along the respective dimensions
  * for matrix products and related algorithms. The blocking sizes depend on various
  * parameters:
  * - the L1 and L2 cache sizes,
  * - the register level blocking sizes defined by gebp_traits,
  * - the number of scalars that fit into a packet (when vectorization is enabled).
  *
  * \sa setCpuCacheSizes */
template<typename LhsScalar, typename RhsScalar, int KcFactor>
void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  EIGEN_UNUSED_VARIABLE(n);
  // Explanations:
  // Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and
  // mc x kc blocks A' on the lhs. A' has to fit into the L2 cache. Moreover, B' is processed
  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
  // at the register level. For vectorization purposes, these small vertical panels are unpacked,
  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
  // stay in the L1 cache.
  std::ptrdiff_t l1, l2;

  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
  enum {
    kdiv = KcFactor * 2 * Traits::nr
         * Traits::RhsProgress * sizeof(RhsScalar),
    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
    mr_mask = (0xffffffff/mr)*mr
  };

  manage_caching_sizes(GetAction, &l1, &l2);
  k = std::min<std::ptrdiff_t>(k, l1/kdiv);
  std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
  if(_m<m) m = _m & mr_mask;
}

template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
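
// Illustrative sketch (not part of the original source): how the blocking sizes
// are typically queried by higher level product code. The scalar types and the
// initial problem sizes below are hypothetical.
//
// \code
// std::ptrdiff_t kc = depth, mc = rows, nc = cols;   // start from the problem sizes
// internal::computeProductBlockingSizes<float,float>(kc, mc, nc);
// // kc and mc now hold cache-friendly panel sizes: a kc x nr sub-panel of the
// // unpacked rhs fits in L1, and an mc x kc block of the lhs fits in L2.
// // The cache sizes themselves can be overridden beforehand with
// // setCpuCacheSizes(l1_in_bytes, l2_in_bytes).
// \endcode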

#ifdef EIGEN_HAS_FUSE_CJMADD
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe ?)

  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
#endif

/* Vectorization logic
 *  real*real: unpack rhs to constant packets, ...
 *
 *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *          storing each res packet into two packets (2x2),
 *          at the end combine them: swap the second and addsub them
 *  cf*cf : same but with 2x4 blocks
 *  cplx*real : unpack rhs to constant packets, ...
 *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
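
// Worked expansion of the cd*cd scheme above (added for clarity, not part of the
// original source). For scalars a = a_r + i*a_i and b = b_r + i*b_i:
//
//   a*b = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r)
//
// With the rhs unpacked into the two real packets [b_r,b_r,...] and [b_i,b_i,...],
// and the lhs loaded as the interleaved real packet [a_r,a_i,...], the two partial
// products accumulated per term are [a_r*b_r, a_i*b_r, ...] and [a_r*b_i, a_i*b_i, ...].
// Swapping real/imag in the second one and add/sub-ing it into the first yields
// exactly the real and imaginary parts of a*b.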

template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction (must be either 2 or 4)
    nr = NumberOfRegisters/4,

    // register block size along the M direction (currently, this one cannot be modified)
    mr = 2 * LhsPacketSize,

    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:
//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
};
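
// Illustrative sketch (not part of the original source): the register-level
// pattern that gebp_kernel builds on top of a traits class T. The packed
// pointers (blA, blB), the scaling packet alphav, the accumulator C0 and the
// result pointer r0 below are hypothetical placeholders.
//
// \code
// T traits;
// typename T::AccPacket C0;  traits.initAcc(C0);
// for(Index k=0; k<depth; ++k)
// {
//   typename T::LhsPacket A0;       traits.loadLhs(blA, A0);
//   typename T::RhsPacket B_0, T0;  traits.loadRhs(blB, B_0);
//   traits.madd(A0, B_0, C0, T0);   // C0 += A0 * B_0 (packet-wise)
//   blA += T::LhsProgress; blB += T::RhsProgress;
// }
// typename T::ResPacket R0 = ploadu<typename T::ResPacket>(r0);
// traits.acc(C0, alphav, R0);       // R0 += alpha * C0
// pstoreu(r0, R0);
// \endcode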

template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
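
// Note added for clarity (not part of the original source): in the vectorized
// madd_impl above, the complex lhs packet is viewed as a plain real packet
// (a.v = [a0_r, a0_i, a1_r, a1_i, ...]) and multiplied by the duplicated real
// rhs packet [b, b, ...]. Since (a_r + i*a_i) * b = (a_r*b) + i*(a_i*b), one real
// multiply-accumulate per lane suffices, and only the final acc() has to take
// the possible conjugation of the lhs into account.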

template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef std::complex<RealScalar>  LhsScalar;
  typedef std::complex<RealScalar>  RhsScalar;
  typedef std::complex<RealScalar>  ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  /* Unpack the rhs coefficients such that each complex coefficient is spread into
   * two packets containing respectively the real and imaginary coefficient
   * duplicated as many times as needed: (x+iy) => [x, ..., x] [y, ..., y]
   */
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
      }
      else
        b[k] = rhs[k];
    }
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
  {
    c = cj.pmadd(a,b,c);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    // assemble c
    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
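
// Note added for clarity (not part of the original source): in acc() above,
// c.first holds (a_r*b_r, a_i*b_r, ...) and c.second holds (a_r*b_i, a_i*b_i, ...)
// for each accumulated term. The (ConjLhs, ConjRhs) cases only change the sign of
// a_i and/or b_i, which is why they differ solely in where pconj is applied and
// in whether the flipped second half is added or subtracted; the partial products
// themselves are computed once, without conjugation.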

template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar>  Scalar;
  typedef RealScalar  LhsScalar;
  typedef Scalar      RhsScalar;
  typedef Scalar      ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
  typedef typename packet_traits<ResScalar>::type  _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
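
// Note added for clarity (not part of the original source): loadLhs above uses
// ploaddup so that each real lhs coefficient is duplicated, e.g. [a0, a1] becomes
// [a0, a0, a1, a1]. This lines every lhs value up with the interleaved (real, imag)
// pair of the corresponding complex rhs coefficient, so the vectorized madd_impl
// can multiply the two packets lane by lane.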

/* optimized GEneral packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
 *  |cplx |real | easy vectorization
 */
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
    Traits traits;

    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc = (rows/mr)*mr;
    // FIXME:
    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc = (depth/4)*4;

    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

    // loops on each micro vertical panel of rhs (depth x nr)
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

      // loops on each largest micro horizontal panel of lhs (mr x depth)
      // => we select a mr x nr micro block of res which is entirely
      //    stored into mr/packet_size x nr registers.
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
                  traits.initAcc(C0);
                  traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
                  traits.initAcc(C4);
                  traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);
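
        // Descriptive note added for clarity (not part of the original source):
        // C0..C3 accumulate the first ResPacket of rows (i .. i+ResPacketSize-1)
        // for the result columns j2 .. j2+3, while C4..C7 accumulate the second
        // ResPacket of rows (i+ResPacketSize .. i+2*ResPacketSize-1) for the same
        // columns. When nr==2 only C0, C1, C4 and C5 are used.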

        // performs "inner" product
        // TODO let's check whether the following peeled loop could not be
        //      optimized via optimal prefetching from one loop to the other
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[5*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[7*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += 4*nr*RhsProgress;
          blA += 4*mr;
        }
        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B_0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.madd(A0,B_0,C0,T0);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[1*RhsProgress], B_0);
            traits.madd(A0,B_0,C1,T0);
            traits.madd(A1,B_0,C5,B_0);
          }
          else
          {
            LhsPacket A0, A1;
            RhsPacket B_0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B_0,C4,B_0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += nr*RhsProgress;
          blA += mr;
        }

        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R2 = ploadu<ResPacket>(r2);
          R3 = ploadu<ResPacket>(r3);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);

          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);

          pstoreu(r1, R1);
          pstoreu(r2, R2);
          pstoreu(r3, R3);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R5);
          pstoreu(r2 + ResPacketSize, R6);
          pstoreu(r3 + ResPacketSize, R0);
        }
        else
        {
          ResPacket R0, R1, R4;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = ploadu<ResPacket>(r0);
          R1 = ploadu<ResPacket>(r1);
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
          traits.acc(C0, alphav, R0);
          pstoreu(r0, R0);
          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
          traits.acc(C1, alphav, R1);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R0);
          pstoreu(r1, R1);
          pstoreu(r0 + ResPacketSize, R4);
          pstoreu(r1 + ResPacketSize, R0);
        }

      }
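
      // Descriptive note added for clarity (not part of the original source):
      // if fewer than mr rows remain, the next block handles a single strip of
      // LhsProgress rows using one accumulator per column (C0..C3), and the
      // final scalar loop below takes care of the leftover rows one by one.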

      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3;
                  traits.initAcc(C0);
                  traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);

        // performs "inner" product
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[3*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[6*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[8*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.loadRhs(&blB[12*RhsProgress], B_0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);

            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*4*RhsProgress;
          blA += 4*LhsProgress;
        }
        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B_0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B_0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B_0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);

            traits.madd(A0,B_0,C0,B_0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

                  R0 = ploadu<ResPacket>(r0);
                  R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);

                  traits.acc(C0, alphav, R0);
                  traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);

                  pstoreu(r0, R0);
                  pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
      }
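
      // Descriptive note added for clarity (not part of the original source):
      // the remaining rows (fewer than LhsProgress of them) cannot use packets,
      // so they are processed one scalar row at a time with the MADD macro.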

      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        // gets a 1 x nr res block as registers
        ResScalar C0(0), C1(0), C2(0), C3(0);
        // TODO directly use blockB ???
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          if(nr==2)
          {
            LhsScalar A0;
            RhsScalar B_0, B1;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
          }
          else
          {
            LhsScalar A0;
            RhsScalar B_0, B1, B2, B3;

            A0 = blA[k];
            B_0 = blB[0];
            B1 = blB[1];
            B2 = blB[2];
            B3 = blB[3];

            MADD(cj,A0,B_0,C0,B_0);
            MADD(cj,A0,B1,C1,B1);
            MADD(cj,A0,B2,C2,B2);
            MADD(cj,A0,B3,C3,B3);
          }

          blB += nr;
        }
        res[(j2+0)*resStride + i] += alpha*C0;
        res[(j2+1)*resStride + i] += alpha*C1;
        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
      }
    }
    // process remaining rhs/res columns one at a time
    // => do the same but with nr==1
    for(Index j2=packet_cols; j2<cols; j2++)
    {
      // unpack B
      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // TODO move the res loads to the stores

        // get res block as registers
        AccPacket C0, C4;
        traits.initAcc(C0);
        traits.initAcc(C4);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0, A1;
          RhsPacket B_0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B_0);
          traits.madd(A0,B_0,C0,T0);
          traits.madd(A1,B_0,C4,B_0);

          blB += RhsProgress;
          blA += 2*LhsProgress;
        }
        ResPacket R0, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];

        R0 = ploadu<ResPacket>(r0);
        R4 = ploadu<ResPacket>(r0+ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C4, alphav, R4);

        pstoreu(r0,               R0);
        pstoreu(r0+ResPacketSize, R4);
      }
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0;
          RhsPacket B_0;
          traits.loadLhs(blA, A0);
          traits.loadRhs(blB, B_0);
          traits.madd(A0, B_0, C0, B_0);
          blB += RhsProgress;
          blA += LhsProgress;
        }

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
        traits.acc(C0, alphav, R0);
        pstoreu(&res[(j2+0)*resStride + i], R0);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        // gets a 1 x 1 res block as registers
        ResScalar C0(0);
        // FIXME directly use blockB ??
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          MADD(cj, A0, B_0, C0, B_0);
        }
        res[(j2+0)*resStride + i] += alpha*C0;
      }
    }
  }
};

#undef CJMADD
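
// Illustrative sketch (not part of the original source): roughly how the packing
// helpers and gebp_kernel defined in this file are chained together by the higher
// level matrix product code. All variable names below are hypothetical, and the
// buffers are assumed to be sized according to computeProductBlockingSizes and
// the traits' WorkSpaceFactor.
//
// \code
// gemm_pack_lhs<Scalar,Index,Traits::mr,Traits::LhsProgress,ColMajor> pack_lhs;
// gemm_pack_rhs<Scalar,Index,Traits::nr,ColMajor>                     pack_rhs;
// gebp_kernel<Scalar,Scalar,Index,Traits::mr,Traits::nr,false,false>  gebp;
//
// pack_rhs(blockB, &rhs(k2,0),  rhsStride, kc, nc);   // pack a kc x nc rhs panel
// pack_lhs(blockA, &lhs(i2,k2), lhsStride, kc, mc);   // pack an mc x kc lhs block
// gebp(&res(i2,0), resStride, blockA, blockB, mc, kc, nc, alpha);
// \endcode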

// pack a block of the lhs
// The traversal is as follows (mr==4):
//   0  4  8 12 ...
//   1  5  9 13 ...
//   2  6 10 14 ...
//   3  7 11 15 ...
//
//  16 20 24 28 ...
//  17 21 25 29 ...
//  18 22 26 30 ...
//  19 23 27 31 ...
//
//  32 33 34 35 ...
//  36 37 38 39 ...
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
                                    Index stride=0, Index offset=0)
  {
    typedef typename packet_traits<Scalar>::type Packet;
    enum { PacketSize = packet_traits<Scalar>::size };

    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
    Index count = 0;
    Index peeled_mc = (rows/Pack1)*Pack1;
    for(Index i=0; i<peeled_mc; i+=Pack1)
    {
      if(PanelMode) count += Pack1 * offset;

      if(StorageOrder==ColMajor)
      {
        for(Index k=0; k<depth; k++)
        {
          Packet A, B, C, D;
          if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
          if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
          if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
          if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
          if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
          if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
          if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
          if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
        }
      }
      else
      {
        for(Index k=0; k<depth; k++)
        {
          // TODO add a vectorized transpose here
          Index w=0;
          for(; w<Pack1-3; w+=4)
          {
            Scalar a(cj(lhs(i+w+0, k))),
                   b(cj(lhs(i+w+1, k))),
                   c(cj(lhs(i+w+2, k))),
                   d(cj(lhs(i+w+3, k)));
            blockA[count++] = a;
            blockA[count++] = b;
            blockA[count++] = c;
            blockA[count++] = d;
          }
          if(Pack1%4)
            for(;w<Pack1;++w)
              blockA[count++] = cj(lhs(i+w, k));
        }
      }
      if(PanelMode) count += Pack1 * (stride-offset-depth);
    }
    if(rows-peeled_mc>=Pack2)
    {
      if(PanelMode) count += Pack2*offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<Pack2; w++)
          blockA[count++] = cj(lhs(peeled_mc+w, k));
      if(PanelMode) count += Pack2 * (stride-offset-depth);
      peeled_mc += Pack2;
    }
    for(Index i=peeled_mc; i<rows; i++)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
        blockA[count++] = cj(lhs(i, k));
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};
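
// Note added for clarity (not part of the original source): in PanelMode each
// packed panel reserves room for `stride` coefficients per row/column even though
// only `depth` of them are written, starting `offset` coefficients into that room.
// For instance, with depth==2, stride==5 and offset==1, each Pack1-wide panel is
// laid out as [1*Pack1 skipped][2*Pack1 packed][2*Pack1 skipped]. The rhs packing
// routines below follow the same convention.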

// copy a complete panel of the rhs
// this version is optimized for column major matrices
// The traversal order is as follows (nr==4):
//   0  1  2  3   12 13 14 15   24 27
//   4  5  6  7   16 17 18 19   25 28
//   8  9 10 11   20 21 22 23   26 29
//   .  .  .  .    .  .  .  .    .  .
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      const Scalar* b1 = &rhs[(j2+1)*rhsStride];
      const Scalar* b2 = &rhs[(j2+2)*rhsStride];
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
                  blockB[count+0] = cj(b0[k]);
                  blockB[count+1] = cj(b1[k]);
        if(nr==4) blockB[count+2] = cj(b2[k]);
        if(nr==4) blockB[count+3] = cj(b3[k]);
        count += nr;
      }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }

    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[(j2+0)*rhsStride];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k]);
        count += 1;
      }
      if(PanelMode) count += (stride-offset-depth);
    }
  }
};

// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
                                    Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols = (cols/nr) * nr;
    Index count = 0;
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // skip what we have before
      if(PanelMode) count += nr * offset;
      for(Index k=0; k<depth; k++)
      {
        const Scalar* b0 = &rhs[k*rhsStride + j2];
                  blockB[count+0] = cj(b0[0]);
                  blockB[count+1] = cj(b0[1]);
        if(nr==4) blockB[count+2] = cj(b0[2]);
        if(nr==4) blockB[count+3] = cj(b0[3]);
        count += nr;
      }
      // skip what we have after
      if(PanelMode) count += nr * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      const Scalar* b0 = &rhs[j2];
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(b0[k*rhsStride]);
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};

} // end namespace internal

/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2;
  internal::manage_caching_sizes(GetAction, &l1, &l2);
  return l2;
}

/** Set the cpu L1 and L2 cache sizes (in bytes).
  * These values are used to adjust the size of the blocks
  * for the block-wise algorithms.
  *
  * \sa computeProductBlockingSizes */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2);
}

} // end namespace Eigen

#endif // EIGEN_GENERAL_BLOCK_PANEL_H