// g++ -O3 -DNDEBUG -I.. -L /usr/lib64/atlas/ benchBlasGemm.cpp -o benchBlasGemm -lrt -lcblas // possible options: // -DEIGEN_DONT_VECTORIZE // -msse2 // #define EIGEN_DEFAULT_TO_ROW_MAJOR #define _FLOAT #include <iostream> #include <Eigen/Core> #include "BenchTimer.h" // include the BLAS headers extern "C" { #include <cblas.h> } #include <string> #ifdef _FLOAT typedef float Scalar; #define CBLAS_GEMM cblas_sgemm #else typedef double Scalar; #define CBLAS_GEMM cblas_dgemm #endif typedef Eigen::Matrix<Scalar,Eigen::Dynamic,Eigen::Dynamic> MyMatrix; void bench_eigengemm(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops); void check_product(int M, int N, int K); void check_product(void); int main(int argc, char *argv[]) { // disable SSE exceptions #ifdef __GNUC__ { int aux; asm( "stmxcsr %[aux] \n\t" "orl $32832, %[aux] \n\t" "ldmxcsr %[aux] \n\t" : : [aux] "m" (aux)); } #endif int nbtries=1, nbloops=1, M, N, K; if (argc==2) { if (std::string(argv[1])=="check") check_product(); else M = N = K = atoi(argv[1]); } else if ((argc==3) && (std::string(argv[1])=="auto")) { M = N = K = atoi(argv[2]); nbloops = 1000000000/(M*M*M); if (nbloops<1) nbloops = 1; nbtries = 6; } else if (argc==4) { M = N = K = atoi(argv[1]); nbloops = atoi(argv[2]); nbtries = atoi(argv[3]); } else if (argc==6) { M = atoi(argv[1]); N = atoi(argv[2]); K = atoi(argv[3]); nbloops = atoi(argv[4]); nbtries = atoi(argv[5]); } else { std::cout << "Usage: " << argv[0] << " size \n"; std::cout << "Usage: " << argv[0] << " auto size\n"; std::cout << "Usage: " << argv[0] << " size nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " M N K nbloops nbtries\n"; std::cout << "Usage: " << argv[0] << " check\n"; std::cout << "Options:\n"; std::cout << " size unique size of the 2 matrices (integer)\n"; std::cout << " auto automatically set the number of repetitions and tries\n"; std::cout << " nbloops number of times the GEMM routines is executed\n"; std::cout << " nbtries number of times the loop is benched (return the best try)\n"; std::cout << " M N K sizes of the matrices: MxN = MxK * KxN (integers)\n"; std::cout << " check check eigen product using cblas as a reference\n"; exit(1); } double nbmad = double(M) * double(N) * double(K) * double(nbloops); if (!(std::string(argv[1])=="auto")) std::cout << M << " x " << N << " x " << K << "\n"; Scalar alpha, beta; MyMatrix ma(M,K), mb(K,N), mc(M,N); ma = MyMatrix::Random(M,K); mb = MyMatrix::Random(K,N); mc = MyMatrix::Random(M,N); Eigen::BenchTimer timer; // we simply compute c += a*b, so: alpha = 1; beta = 1; // bench cblas // ROWS_A, COLS_B, COLS_A, 1.0, A, COLS_A, B, COLS_B, 0.0, C, COLS_B); if (!(std::string(argv[1])=="auto")) { timer.reset(); for (uint k=0 ; k<nbtries ; ++k) { timer.start(); for (uint j=0 ; j<nbloops ; ++j) #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR CBLAS_GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, ma.data(), K, mb.data(), N, beta, mc.data(), N); #else CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, ma.data(), M, mb.data(), K, beta, mc.data(), M); #endif timer.stop(); } if (!(std::string(argv[1])=="auto")) std::cout << "cblas: " << timer.value() << " (" << 1e-3*floor(1e-6*nbmad/timer.value()) << " GFlops/s)\n"; else std::cout << M << " : " << timer.value() << " ; " << 1e-3*floor(1e-6*nbmad/timer.value()) << "\n"; } // clear ma = MyMatrix::Random(M,K); mb = MyMatrix::Random(K,N); mc = MyMatrix::Random(M,N); // eigen // if (!(std::string(argv[1])=="auto")) { timer.reset(); for (uint k=0 ; k<nbtries ; ++k) { timer.start(); bench_eigengemm(mc, ma, mb, nbloops); timer.stop(); } if (!(std::string(argv[1])=="auto")) std::cout << "eigen : " << timer.value() << " (" << 1e-3*floor(1e-6*nbmad/timer.value()) << " GFlops/s)\n"; else std::cout << M << " : " << timer.value() << " ; " << 1e-3*floor(1e-6*nbmad/timer.value()) << "\n"; } std::cout << "l1: " << Eigen::l1CacheSize() << std::endl; std::cout << "l2: " << Eigen::l2CacheSize() << std::endl; return 0; } using namespace Eigen; void bench_eigengemm(MyMatrix& mc, const MyMatrix& ma, const MyMatrix& mb, int nbloops) { for (uint j=0 ; j<nbloops ; ++j) mc.noalias() += ma * mb; } #define MYVERIFY(A,M) if (!(A)) { \ std::cout << "FAIL: " << M << "\n"; \ } void check_product(int M, int N, int K) { MyMatrix ma(M,K), mb(K,N), mc(M,N), maT(K,M), mbT(N,K), meigen(M,N), mref(M,N); ma = MyMatrix::Random(M,K); mb = MyMatrix::Random(K,N); maT = ma.transpose(); mbT = mb.transpose(); mc = MyMatrix::Random(M,N); MyMatrix::Scalar eps = 1e-4; meigen = mref = mc; CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, ma.data(), M, mb.data(), K, 1, mref.data(), M); meigen += ma * mb; MYVERIFY(meigen.isApprox(mref, eps),". * ."); meigen = mref = mc; CBLAS_GEMM(CblasColMajor, CblasTrans, CblasNoTrans, M, N, K, 1, maT.data(), K, mb.data(), K, 1, mref.data(), M); meigen += maT.transpose() * mb; MYVERIFY(meigen.isApprox(mref, eps),"T * ."); meigen = mref = mc; CBLAS_GEMM(CblasColMajor, CblasTrans, CblasTrans, M, N, K, 1, maT.data(), K, mbT.data(), N, 1, mref.data(), M); meigen += (maT.transpose()) * (mbT.transpose()); MYVERIFY(meigen.isApprox(mref, eps),"T * T"); meigen = mref = mc; CBLAS_GEMM(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, 1, ma.data(), M, mbT.data(), N, 1, mref.data(), M); meigen += ma * mbT.transpose(); MYVERIFY(meigen.isApprox(mref, eps),". * T"); } void check_product(void) { int M, N, K; for (uint i=0; i<1000; ++i) { M = internal::random<int>(1,64); N = internal::random<int>(1,768); K = internal::random<int>(1,768); M = (0 + M) * 1; std::cout << M << " x " << N << " x " << K << "\n"; check_product(M, N, K); } }