1 #include <typeinfo> 2 #include <iostream> 3 #include <Eigen/Core> 4 #include "BenchTimer.h" 5 using namespace Eigen; 6 using namespace std; 7 8 template<typename T> 9 EIGEN_DONT_INLINE typename T::Scalar sqsumNorm(const T& v) 10 { 11 return v.norm(); 12 } 13 14 template<typename T> 15 EIGEN_DONT_INLINE typename T::Scalar hypotNorm(const T& v) 16 { 17 return v.hypotNorm(); 18 } 19 20 template<typename T> 21 EIGEN_DONT_INLINE typename T::Scalar blueNorm(const T& v) 22 { 23 return v.blueNorm(); 24 } 25 26 template<typename T> 27 EIGEN_DONT_INLINE typename T::Scalar lapackNorm(T& v) 28 { 29 typedef typename T::Scalar Scalar; 30 int n = v.size(); 31 Scalar scale = 0; 32 Scalar ssq = 1; 33 for (int i=0;i<n;++i) 34 { 35 Scalar ax = internal::abs(v.coeff(i)); 36 if (scale >= ax) 37 { 38 ssq += internal::abs2(ax/scale); 39 } 40 else 41 { 42 ssq = Scalar(1) + ssq * internal::abs2(scale/ax); 43 scale = ax; 44 } 45 } 46 return scale * internal::sqrt(ssq); 47 } 48 49 template<typename T> 50 EIGEN_DONT_INLINE typename T::Scalar twopassNorm(T& v) 51 { 52 typedef typename T::Scalar Scalar; 53 Scalar s = v.cwise().abs().maxCoeff(); 54 return s*(v/s).norm(); 55 } 56 57 template<typename T> 58 EIGEN_DONT_INLINE typename T::Scalar bl2passNorm(T& v) 59 { 60 return v.stableNorm(); 61 } 62 63 template<typename T> 64 EIGEN_DONT_INLINE typename T::Scalar divacNorm(T& v) 65 { 66 int n =v.size() / 2; 67 for (int i=0;i<n;++i) 68 v(i) = v(2*i)*v(2*i) + v(2*i+1)*v(2*i+1); 69 n = n/2; 70 while (n>0) 71 { 72 for (int i=0;i<n;++i) 73 v(i) = v(2*i) + v(2*i+1); 74 n = n/2; 75 } 76 return internal::sqrt(v(0)); 77 } 78 79 #ifdef EIGEN_VECTORIZE 80 Packet4f internal::plt(const Packet4f& a, Packet4f& b) { return _mm_cmplt_ps(a,b); } 81 Packet2d internal::plt(const Packet2d& a, Packet2d& b) { return _mm_cmplt_pd(a,b); } 82 83 Packet4f internal::pandnot(const Packet4f& a, Packet4f& b) { return _mm_andnot_ps(a,b); } 84 Packet2d internal::pandnot(const Packet2d& a, Packet2d& b) { return _mm_andnot_pd(a,b); } 85 #endif 86 87 template<typename T> 88 EIGEN_DONT_INLINE typename T::Scalar pblueNorm(const T& v) 89 { 90 #ifndef EIGEN_VECTORIZE 91 return v.blueNorm(); 92 #else 93 typedef typename T::Scalar Scalar; 94 95 static int nmax = 0; 96 static Scalar b1, b2, s1m, s2m, overfl, rbig, relerr; 97 int n; 98 99 if(nmax <= 0) 100 { 101 int nbig, ibeta, it, iemin, iemax, iexp; 102 Scalar abig, eps; 103 104 nbig = std::numeric_limits<int>::max(); // largest integer 105 ibeta = std::numeric_limits<Scalar>::radix; //NumTraits<Scalar>::Base; // base for floating-point numbers 106 it = std::numeric_limits<Scalar>::digits; //NumTraits<Scalar>::Mantissa; // number of base-beta digits in mantissa 107 iemin = std::numeric_limits<Scalar>::min_exponent; // minimum exponent 108 iemax = std::numeric_limits<Scalar>::max_exponent; // maximum exponent 109 rbig = std::numeric_limits<Scalar>::max(); // largest floating-point number 110 111 // Check the basic machine-dependent constants. 112 if(iemin > 1 - 2*it || 1+it>iemax || (it==2 && ibeta<5) 113 || (it<=4 && ibeta <= 3 ) || it<2) 114 { 115 eigen_assert(false && "the algorithm cannot be guaranteed on this computer"); 116 } 117 iexp = -((1-iemin)/2); 118 b1 = std::pow(ibeta, iexp); // lower boundary of midrange 119 iexp = (iemax + 1 - it)/2; 120 b2 = std::pow(ibeta,iexp); // upper boundary of midrange 121 122 iexp = (2-iemin)/2; 123 s1m = std::pow(ibeta,iexp); // scaling factor for lower range 124 iexp = - ((iemax+it)/2); 125 s2m = std::pow(ibeta,iexp); // scaling factor for upper range 126 127 overfl = rbig*s2m; // overfow boundary for abig 128 eps = std::pow(ibeta, 1-it); 129 relerr = internal::sqrt(eps); // tolerance for neglecting asml 130 abig = 1.0/eps - 1.0; 131 if (Scalar(nbig)>abig) nmax = abig; // largest safe n 132 else nmax = nbig; 133 } 134 135 typedef typename internal::packet_traits<Scalar>::type Packet; 136 const int ps = internal::packet_traits<Scalar>::size; 137 Packet pasml = internal::pset1(Scalar(0)); 138 Packet pamed = internal::pset1(Scalar(0)); 139 Packet pabig = internal::pset1(Scalar(0)); 140 Packet ps2m = internal::pset1(s2m); 141 Packet ps1m = internal::pset1(s1m); 142 Packet pb2 = internal::pset1(b2); 143 Packet pb1 = internal::pset1(b1); 144 for(int j=0; j<v.size(); j+=ps) 145 { 146 Packet ax = internal::pabs(v.template packet<Aligned>(j)); 147 Packet ax_s2m = internal::pmul(ax,ps2m); 148 Packet ax_s1m = internal::pmul(ax,ps1m); 149 Packet maskBig = internal::plt(pb2,ax); 150 Packet maskSml = internal::plt(ax,pb1); 151 152 // Packet maskMed = internal::pand(maskSml,maskBig); 153 // Packet scale = internal::pset1(Scalar(0)); 154 // scale = internal::por(scale, internal::pand(maskBig,ps2m)); 155 // scale = internal::por(scale, internal::pand(maskSml,ps1m)); 156 // scale = internal::por(scale, internal::pandnot(internal::pset1(Scalar(1)),maskMed)); 157 // ax = internal::pmul(ax,scale); 158 // ax = internal::pmul(ax,ax); 159 // pabig = internal::padd(pabig, internal::pand(maskBig, ax)); 160 // pasml = internal::padd(pasml, internal::pand(maskSml, ax)); 161 // pamed = internal::padd(pamed, internal::pandnot(ax,maskMed)); 162 163 164 pabig = internal::padd(pabig, internal::pand(maskBig, internal::pmul(ax_s2m,ax_s2m))); 165 pasml = internal::padd(pasml, internal::pand(maskSml, internal::pmul(ax_s1m,ax_s1m))); 166 pamed = internal::padd(pamed, internal::pandnot(internal::pmul(ax,ax),internal::pand(maskSml,maskBig))); 167 } 168 Scalar abig = internal::predux(pabig); 169 Scalar asml = internal::predux(pasml); 170 Scalar amed = internal::predux(pamed); 171 if(abig > Scalar(0)) 172 { 173 abig = internal::sqrt(abig); 174 if(abig > overfl) 175 { 176 eigen_assert(false && "overflow"); 177 return rbig; 178 } 179 if(amed > Scalar(0)) 180 { 181 abig = abig/s2m; 182 amed = internal::sqrt(amed); 183 } 184 else 185 { 186 return abig/s2m; 187 } 188 189 } 190 else if(asml > Scalar(0)) 191 { 192 if (amed > Scalar(0)) 193 { 194 abig = internal::sqrt(amed); 195 amed = internal::sqrt(asml) / s1m; 196 } 197 else 198 { 199 return internal::sqrt(asml)/s1m; 200 } 201 } 202 else 203 { 204 return internal::sqrt(amed); 205 } 206 asml = std::min(abig, amed); 207 abig = std::max(abig, amed); 208 if(asml <= abig*relerr) 209 return abig; 210 else 211 return abig * internal::sqrt(Scalar(1) + internal::abs2(asml/abig)); 212 #endif 213 } 214 215 #define BENCH_PERF(NRM) { \ 216 Eigen::BenchTimer tf, td, tcf; tf.reset(); td.reset(); tcf.reset();\ 217 for (int k=0; k<tries; ++k) { \ 218 tf.start(); \ 219 for (int i=0; i<iters; ++i) NRM(vf); \ 220 tf.stop(); \ 221 } \ 222 for (int k=0; k<tries; ++k) { \ 223 td.start(); \ 224 for (int i=0; i<iters; ++i) NRM(vd); \ 225 td.stop(); \ 226 } \ 227 for (int k=0; k<std::max(1,tries/3); ++k) { \ 228 tcf.start(); \ 229 for (int i=0; i<iters; ++i) NRM(vcf); \ 230 tcf.stop(); \ 231 } \ 232 std::cout << #NRM << "\t" << tf.value() << " " << td.value() << " " << tcf.value() << "\n"; \ 233 } 234 235 void check_accuracy(double basef, double based, int s) 236 { 237 double yf = basef * internal::abs(internal::random<double>()); 238 double yd = based * internal::abs(internal::random<double>()); 239 VectorXf vf = VectorXf::Ones(s) * yf; 240 VectorXd vd = VectorXd::Ones(s) * yd; 241 242 std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n"; 243 std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\n"; 244 std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\n"; 245 std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\n"; 246 std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\n"; 247 std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\n"; 248 std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\n"; 249 std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\n"; 250 } 251 252 void check_accuracy_var(int ef0, int ef1, int ed0, int ed1, int s) 253 { 254 VectorXf vf(s); 255 VectorXd vd(s); 256 for (int i=0; i<s; ++i) 257 { 258 vf[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ef0,ef1)); 259 vd[i] = internal::abs(internal::random<double>()) * std::pow(double(10), internal::random<int>(ed0,ed1)); 260 } 261 262 //std::cout << "reference\t" << internal::sqrt(double(s))*yf << "\t" << internal::sqrt(double(s))*yd << "\n"; 263 std::cout << "sqsumNorm\t" << sqsumNorm(vf) << "\t" << sqsumNorm(vd) << "\t" << sqsumNorm(vf.cast<long double>()) << "\t" << sqsumNorm(vd.cast<long double>()) << "\n"; 264 std::cout << "hypotNorm\t" << hypotNorm(vf) << "\t" << hypotNorm(vd) << "\t" << hypotNorm(vf.cast<long double>()) << "\t" << hypotNorm(vd.cast<long double>()) << "\n"; 265 std::cout << "blueNorm\t" << blueNorm(vf) << "\t" << blueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n"; 266 std::cout << "pblueNorm\t" << pblueNorm(vf) << "\t" << pblueNorm(vd) << "\t" << blueNorm(vf.cast<long double>()) << "\t" << blueNorm(vd.cast<long double>()) << "\n"; 267 std::cout << "lapackNorm\t" << lapackNorm(vf) << "\t" << lapackNorm(vd) << "\t" << lapackNorm(vf.cast<long double>()) << "\t" << lapackNorm(vd.cast<long double>()) << "\n"; 268 std::cout << "twopassNorm\t" << twopassNorm(vf) << "\t" << twopassNorm(vd) << "\t" << twopassNorm(vf.cast<long double>()) << "\t" << twopassNorm(vd.cast<long double>()) << "\n"; 269 // std::cout << "bl2passNorm\t" << bl2passNorm(vf) << "\t" << bl2passNorm(vd) << "\t" << bl2passNorm(vf.cast<long double>()) << "\t" << bl2passNorm(vd.cast<long double>()) << "\n"; 270 } 271 272 int main(int argc, char** argv) 273 { 274 int tries = 10; 275 int iters = 100000; 276 double y = 1.1345743233455785456788e12 * internal::random<double>(); 277 VectorXf v = VectorXf::Ones(1024) * y; 278 279 // return 0; 280 int s = 10000; 281 double basef_ok = 1.1345743233455785456788e15; 282 double based_ok = 1.1345743233455785456788e95; 283 284 double basef_under = 1.1345743233455785456788e-27; 285 double based_under = 1.1345743233455785456788e-303; 286 287 double basef_over = 1.1345743233455785456788e+27; 288 double based_over = 1.1345743233455785456788e+302; 289 290 std::cout.precision(20); 291 292 std::cerr << "\nNo under/overflow:\n"; 293 check_accuracy(basef_ok, based_ok, s); 294 295 std::cerr << "\nUnderflow:\n"; 296 check_accuracy(basef_under, based_under, s); 297 298 std::cerr << "\nOverflow:\n"; 299 check_accuracy(basef_over, based_over, s); 300 301 std::cerr << "\nVarying (over):\n"; 302 for (int k=0; k<1; ++k) 303 { 304 check_accuracy_var(20,27,190,302,s); 305 std::cout << "\n"; 306 } 307 308 std::cerr << "\nVarying (under):\n"; 309 for (int k=0; k<1; ++k) 310 { 311 check_accuracy_var(-27,20,-302,-190,s); 312 std::cout << "\n"; 313 } 314 315 std::cout.precision(4); 316 std::cerr << "Performance (out of cache):\n"; 317 { 318 int iters = 1; 319 VectorXf vf = VectorXf::Random(1024*1024*32) * y; 320 VectorXd vd = VectorXd::Random(1024*1024*32) * y; 321 VectorXcf vcf = VectorXcf::Random(1024*1024*32) * y; 322 BENCH_PERF(sqsumNorm); 323 BENCH_PERF(blueNorm); 324 // BENCH_PERF(pblueNorm); 325 // BENCH_PERF(lapackNorm); 326 // BENCH_PERF(hypotNorm); 327 // BENCH_PERF(twopassNorm); 328 BENCH_PERF(bl2passNorm); 329 } 330 331 std::cerr << "\nPerformance (in cache):\n"; 332 { 333 int iters = 100000; 334 VectorXf vf = VectorXf::Random(512) * y; 335 VectorXd vd = VectorXd::Random(512) * y; 336 VectorXcf vcf = VectorXcf::Random(512) * y; 337 BENCH_PERF(sqsumNorm); 338 BENCH_PERF(blueNorm); 339 // BENCH_PERF(pblueNorm); 340 // BENCH_PERF(lapackNorm); 341 // BENCH_PERF(hypotNorm); 342 // BENCH_PERF(twopassNorm); 343 BENCH_PERF(bl2passNorm); 344 } 345 } 346