Multi-threading benchmarking issues

I recently wrote an answer to a similar question: SO: Eigen library with C++11 multithreading.

As I’m interested in this topic too and already had working code at hand, I adapted that sample to the OP’s task of matrix multiplication:

test-multi-threading-matrix.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
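    // row-major storage: operator[](i) returns a pointer to the contiguous i-th row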
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  const size_t NThreads = std::thread::hardware_concurrency();
  const size_t nThreads = std::min(dim * dim, NThreads);
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results4[4] = {
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - stupid approach
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t k = 0, n = dim * dim; k < n;) {
        size_t nT = 0;
        for (; nT < nThreads && k < n; ++nT, ++k) {
          const size_t i = k / dim, j = k % dim;
          threads[nT] = std::thread(
            [i, j, &results, &a, &b]() {
              results[i][j] = dot(a, i, b, j);
            });
        }
        for (size_t iT = 0; iT < nT; ++iT) threads[iT].join();
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - interleaved
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[2];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            for (size_t k = iT, n = dim * dim; k < n; k += nThreads) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          });
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - grouped
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[3];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            const size_t n = dim * dim;
            for (size_t k = iT * n / nThreads, kN = (iT + 1) * n / nThreads;
              k < kN; ++k) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          });
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results4 / sizeof *results4;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results4[0][i][j] != results4[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results4[0][" << i << "]["  << j << "]: "
            << results4[0][i][j]
            << " != results4[" << iResult << "][" << i << "][" << j << "]: "
            << results4[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results4[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  std::cout << "std::thread::hardware_concurrency(): "
    << std::thread::hardware_concurrency() << '\n';
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "Single"
      << " | " << std::setw(10) << "Multi 1"
      << " | " << std::setw(10) << "Multi 2"
      << " | " << std::setw(10) << "Multi 3"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

In my first tests, I started with 2×2 matrices and doubled the number of rows and columns for each test series, ending with 64×64 matrices.

I soon came to the same conclusion as Mike: these matrices are much too small. The overhead for setting up and joining threads consumes any speed-up that might have been gained through concurrency. So, I modified the test series to start with 64×64 matrices and end with 512×512.
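
That setup/join overhead is easy to make visible in isolation. Here is a minimal sketch (my own addition, not part of the sample above; the numbers vary by platform) which times empty create/join cycles:

#include <chrono>
#include <iostream>
#include <thread>

int main()
{
  typedef std::chrono::high_resolution_clock Clock;
  const unsigned n = 1000;
  const Clock::time_point t0 = Clock::now();
  // create and immediately join n threads with an empty work load
  for (unsigned i = 0; i < n; ++i) std::thread([]() { /* no work */ }).join();
  const std::chrono::microseconds dt
    = std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - t0);
  std::cout << "avg. create/join overhead: " << dt.count() / (double)n << " us\n";
  return 0;
}

Whatever this prints on a given machine, keep in mind that Multi 1 below pays this price dim * dim times.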

I compiled and ran it on cygwin64 (on Windows 10):

$ g++ --version
g++ (GCC) 7.3.0

$ g++ -std=c++17 -O2 test-multi-threading-matrix.cc -o test-multi-threading-matrix

$ ./test-multi-threading-matrix
std::thread::hardware_concurrency(): 8
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |        417 |     482837 |       1068 |       1080
 2. |        403 |     486775 |       1034 |       1225
 3. |        289 |     482578 |       1478 |       1151
 4. |        282 |     502703 |       1103 |       1081
 5. |        398 |     495351 |       1287 |       1124
 6. |        404 |     501426 |       1050 |       1017
 7. |        402 |     483517 |       1000 |        980
 8. |        271 |     498591 |       1092 |       1047
 9. |        284 |     494732 |        984 |       1057
10. |        288 |     494738 |       1050 |       1116
Average Values:
    |      343.8 |   492324.8 |     1114.6 |     1087.8
Ratio:
    |      1.000 |   1432.009 |      3.242 |      3.164

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |       2282 |    1995527 |       2215 |       1574
 2. |       3076 |    1954316 |       1644 |       1679
 3. |       2952 |    1981908 |       2572 |       2250
 4. |       2119 |    1986365 |       1568 |       1462
 5. |       2676 |    2212344 |       1615 |       1657
 6. |       2396 |    1981545 |       1776 |       1593
 7. |       2513 |    1983718 |       1950 |       1580
 8. |       2614 |    1852414 |       1737 |       1670
 9. |       2148 |    1955587 |       1805 |       1609
10. |       2161 |    1980772 |       1794 |       1826
Average Values:
    |     2493.7 |  1988449.6 |     1867.6 |     1690.0
Ratio:
    |      1.000 |    797.389 |      0.749 |      0.678

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |      32418 |    7992363 |      11753 |      11712
 2. |      32723 |    7961725 |      12342 |      12490
 3. |      32150 |    8041516 |      14646 |      12304
 4. |      30207 |    7810907 |      11512 |      11992
 5. |      30108 |    8005317 |      12853 |      12850
 6. |      32665 |    8064963 |      13197 |      13386
 7. |      36286 |    8825215 |      14381 |      15636
 8. |      35068 |    8015930 |      16954 |      12287
 9. |      30673 |    7973273 |      12061 |      13677
10. |      36323 |    7861856 |      14223 |      13510
Average Values:
    |    32862.1 |  8055306.5 |    13392.2 |    12984.4
Ratio:
    |      1.000 |    245.125 |      0.408 |      0.395

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |     404459 |   32803878 |     107078 |     103493
 2. |     289870 |   32482887 |      98244 |     103338
 3. |     333695 |   29398109 |      87735 |      77531
 4. |     236028 |   27286537 |      81620 |      76085
 5. |     254294 |   27418963 |      89191 |      76760
 6. |     230662 |   27278077 |      78454 |      84063
 7. |     274278 |   27180899 |      74828 |      83829
 8. |     292294 |   29942221 |     106133 |     103450
 9. |     292091 |   33011277 |     100545 |      96935
10. |     401007 |   33502134 |      98230 |      95592
Average Values:
    |   300867.8 | 30030498.2 |    92205.8 |    90107.6
Ratio:
    |      1.000 |     99.813 |      0.306 |      0.299

I did the same with VS2013 (release mode) and got similar results.

A speed-up of 3 doesn’t sound that bad (ignoring the fact that it’s still far away from the 8 you might expect as ideal for a hardware concurrency of 8).
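
As a quick cross-check against the 512×512 table above: the speed-up of Multi 3 is 300867.8 / 90107.6 ≈ 3.34, i.e. just the reciprocal of the printed ratio 0.299.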


While fiddling with the matrix multiplication, I got an idea for an optimization which I wanted to check as well, even beyond multi-threading: an attempt to improve cache locality.

For this, I transpose the 2nd matrix before the multiplication. The multiplication then uses a modified version of dot(), dotT(), which takes the transposition of the 2nd matrix into account.
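
The point is the memory access pattern of the inner loops. With the row-major storage of MatrixT from above, the strides look like this (illustration only):

// inner loop of dot():  the 2nd matrix is walked column-wise
sum += mat1[iRow][i] * mat2[i][iCol];    // stride: nCols elements per step
// inner loop of dotT(): the transposed 2nd matrix is walked row-wise
sum += mat1[iRow1][i] * matT2[iRow2][i]; // stride: 1 element per step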

I modified the above sample code accordingly and got test-single-threading-matrix-transpose.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
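    // row-major storage: operator[](i) returns a pointer to the contiguous i-th row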
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

template <typename VALUE>
MatrixT<VALUE> transpose(const MatrixT<VALUE> &mat)
{
  MatrixT<VALUE> matT(mat.getNumCols(), mat.getNumRows());
  for (size_t i = 0; i < mat.getNumRows(); ++i) {
    for (size_t j = 0; j < mat.getNumCols(); ++j) {
      matT[j][i] = mat[i][j];
    }
  }
  return matT;
}

template <typename VALUE>
VALUE dotT(const MatrixT<VALUE> &mat1, size_t iRow1, const MatrixT<VALUE> &matT2, size_t iRow2)
{
  const size_t n = mat1.getNumCols();
  assert(n == matT2.getNumCols());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow1][i] * matT2[iRow2][i];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results2[2] = {
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // single threading - with transposed matrix
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      const Matrix bT = transpose(b);
      // do experiment single-threaded with transposed B
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dotT(a, i, bT, j);
      }
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results2 / sizeof *results2;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results2[0][i][j] != results2[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results2[0][" << i << "]["  << j << "]: "
            << results2[0][i][j]
            << " != results2[" << iResult << "][" << i << "][" << j << "]: "
            << results2[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results2[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "A * B"
      << " | " << std::setw(10) << "A *T B^T"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

I compiled and ran it again on cygwin64 (on Windows 10):

$ g++ -std=c++17 -O2 test-single-threading-matrix-transpose.cc -o test-single-threading-matrix-transpose && ./test-single-threading-matrix-transpose
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |        394 |        366
 2. |        394 |        368
 3. |        396 |        367
 4. |        382 |        368
 5. |        392 |        289
 6. |        297 |        343
 7. |        360 |        341
 8. |        399 |        358
 9. |        385 |        354
10. |        406 |        374
Average Values:
    |      380.5 |      352.8
Ratio:
    |      1.000 |      0.927

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |       2972 |       2558
 2. |       3317 |       2556
 3. |       3279 |       2689
 4. |       2952 |       2213
 5. |       2745 |       2668
 6. |       2981 |       2457
 7. |       2164 |       2274
 8. |       2634 |       2106
 9. |       2126 |       2389
10. |       3015 |       2477
Average Values:
    |     2818.5 |     2438.7
Ratio:
    |      1.000 |      0.865

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |      31312 |      17656
 2. |      29249 |      17127
 3. |      32127 |      16865
 4. |      29655 |      17287
 5. |      32137 |      17687
 6. |      29788 |      16732
 7. |      32251 |      16549
 8. |      32272 |      16257
 9. |      28019 |      18042
10. |      30334 |      17936
Average Values:
    |    30714.4 |    17213.8
Ratio:
    |      1.000 |      0.560

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |     322005 |     135102
 2. |     310180 |     134897
 3. |     329994 |     134304
 4. |     335375 |     137701
 5. |     330754 |     134252
 6. |     353761 |     136732
 7. |     359234 |     135632
 8. |     351498 |     134389
 9. |     360754 |     135751
10. |     368602 |     137139
Average Values:
    |   342215.7 |   135589.9
Ratio:
    |      1.000 |      0.396

Impressive, isn’t it?

It achieves speed-ups similar to the better of the above multi-threading attempts, but on a single core.

The additional effort of transposing the 2nd matrix (which is included in the measurement) more than pays for itself. This isn’t that surprising: the multiplication performs on the order of n³ element reads (which now hit consecutive addresses), while constructing and writing the transposed matrix once costs only n² reads and writes.
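
The natural follow-up would be to combine both ideas. This is just a sketch (I didn’t measure it here) which merges the transposed 2nd matrix with the “grouped” multi-threading variant from the first sample; it assumes the Matrix, transpose(), and dotT() definitions from above plus the <thread> and <vector> includes:

Matrix multiplyGroupedT(const Matrix &a, const Matrix &b, size_t nThreads)
{
  const size_t dim = a.getNumRows();
  Matrix results(dim, dim);
  // pay the n^2 transposition once so that all dot products read consecutive memory
  const Matrix bT = transpose(b);
  std::vector<std::thread> threads(nThreads);
  for (size_t iT = 0; iT < nThreads; ++iT) {
    threads[iT] = std::thread(
      [iT, dim, &results, &a, &bT, nThreads]() {
        // each thread processes a contiguous range of result cells
        const size_t n = dim * dim;
        for (size_t k = iT * n / nThreads, kN = (iT + 1) * n / nThreads;
          k < kN; ++k) {
          const size_t i = k / dim, j = k % dim;
          results[i][j] = dotT(a, i, bT, j);
        }
      });
  }
  for (std::thread &threadI : threads) threadI.join();
  return results;
}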
