// This sample demonstrates working on one piece of data using two GPUs.
// It splits input into two parts and processes them separately on different GPUs.
#ifdef WIN32
#define NOMINMAX
#include <windows.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif
#include <iostream>
#include <iomanip>
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudastereo.hpp"
#include "tick_meter.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
///////////////////////////////////////////////////////////
// Thread
// OS-specific wrappers for multi-threading
#ifdef WIN32
class Thread
{
struct UserData
{
void (*func)(void* userData);
void* param;
};
static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
{
UserData* userData = static_cast<UserData*>(lpParam);
userData->func(userData->param);
return 0;
}
UserData userData_;
HANDLE thread_;
DWORD threadId_;
public:
Thread(void (*func)(void* userData), void* userData)
{
userData_.func = func;
userData_.param = userData;
thread_ = CreateThread(
NULL, // default security attributes
0, // use default stack size
WinThreadFunction, // thread function name
&userData_, // argument to thread function
0, // use default creation flags
&threadId_); // returns the thread identifier
}
~Thread()
{
CloseHandle(thread_);
}
void wait()
{
WaitForSingleObject(thread_, INFINITE);
}
};
#else
class Thread
{
struct UserData
{
void (*func)(void* userData);
void* param;
};
static void* PThreadFunction(void* lpParam)
{
UserData* userData = static_cast<UserData*>(lpParam);
userData->func(userData->param);
return 0;
}
pthread_t thread_;
UserData userData_;
public:
Thread(void (*func)(void* userData), void* userData)
{
userData_.func = func;
userData_.param = userData;
pthread_create(&thread_, NULL, PThreadFunction, &userData_);
}
~Thread()
{
pthread_detach(thread_);
}
void wait()
{
pthread_join(thread_, NULL);
}
};
#endif
///////////////////////////////////////////////////////////
// StereoSingleGpu
// Run Stereo algorithm on single GPU
class StereoSingleGpu
{
public:
explicit StereoSingleGpu(int deviceId = 0);
~StereoSingleGpu();
void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
private:
int deviceId_;
GpuMat d_leftFrame;
GpuMat d_rightFrame;
GpuMat d_disparity;
Ptr<cuda::StereoBM> d_alg;
};
StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
{
cuda::setDevice(deviceId_);
d_alg = cuda::createStereoBM(256);
}
StereoSingleGpu::~StereoSingleGpu()
{
cuda::setDevice(deviceId_);
d_leftFrame.release();
d_rightFrame.release();
d_disparity.release();
d_alg.release();
}
void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
{
cuda::setDevice(deviceId_);
d_leftFrame.upload(leftFrame);
d_rightFrame.upload(rightFrame);
d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
d_disparity.download(disparity);
}
///////////////////////////////////////////////////////////
// StereoMultiGpuThread
// Run Stereo algorithm on two GPUs using different host threads
class StereoMultiGpuThread
{
public:
StereoMultiGpuThread();
~StereoMultiGpuThread();
void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
private:
GpuMat d_leftFrames[2];
GpuMat d_rightFrames[2];
GpuMat d_disparities[2];
Ptr<cuda::StereoBM> d_algs[2];
struct StereoLaunchData
{
int deviceId;
Mat leftFrame;
Mat rightFrame;
Mat disparity;
GpuMat* d_leftFrame;
GpuMat* d_rightFrame;
GpuMat* d_disparity;
Ptr<cuda::StereoBM> d_alg;
};
static void launchGpuStereoAlg(void* userData);
};
StereoMultiGpuThread::StereoMultiGpuThread()
{
cuda::setDevice(0);
d_algs[0] = cuda::createStereoBM(256);
cuda::setDevice(1);
d_algs[1] = cuda::createStereoBM(256);
}
StereoMultiGpuThread::~StereoMultiGpuThread()
{
cuda::setDevice(0);
d_leftFrames[0].release();
d_rightFrames[0].release();
d_disparities[0].release();
d_algs[0].release();
cuda::setDevice(1);
d_leftFrames[1].release();
d_rightFrames[1].release();
d_disparities[1].release();
d_algs[1].release();
}
void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
{
disparity.create(leftFrame.size(), CV_8UC1);
// Split input data onto two parts for each GPUs.
// We add small border for each part,
// because original algorithm doesn't calculate disparity on image borders.
// With such padding we will get output in the middle of final result.
StereoLaunchData launchDatas[2];
launchDatas[0].deviceId = 0;
launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
launchDatas[0].d_leftFrame = &d_leftFrames[0];
launchDatas[0].d_rightFrame = &d_rightFrames[0];
launchDatas[0].d_disparity = &d_disparities[0];
launchDatas[0].d_alg = d_algs[0];
launchDatas[1].deviceId = 1;
launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
launchDatas[1].d_leftFrame = &d_leftFrames[1];
launchDatas[1].d_rightFrame = &d_rightFrames[1];
launchDatas[1].d_disparity = &d_disparities[1];
launchDatas[1].d_alg = d_algs[1];
Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
thread0.wait();
thread1.wait();
}
void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
{
StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
cuda::setDevice(data->deviceId);
data->d_leftFrame->upload(data->leftFrame);
data->d_rightFrame->upload(data->rightFrame);
data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
if (data->deviceId == 0)
data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
else
data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
}
///////////////////////////////////////////////////////////
// StereoMultiGpuStream
// Run Stereo algorithm on two GPUs from single host thread using async API
class StereoMultiGpuStream
{
public:
StereoMultiGpuStream();
~StereoMultiGpuStream();
void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
private:
GpuMat d_leftFrames[2];
GpuMat d_rightFrames[2];
GpuMat d_disparities[2];
Ptr<cuda::StereoBM> d_algs[2];
Ptr<Stream> streams[2];
};
StereoMultiGpuStream::StereoMultiGpuStream()
{
cuda::setDevice(0);
d_algs[0] = cuda::createStereoBM(256);
streams[0] = makePtr<Stream>();
cuda::setDevice(1);
d_algs[1] = cuda::createStereoBM(256);
streams[1] = makePtr<Stream>();
}
StereoMultiGpuStream::~StereoMultiGpuStream()
{
cuda::setDevice(0);
d_leftFrames[0].release();
d_rightFrames[0].release();
d_disparities[0].release();
d_algs[0].release();
streams[0].release();
cuda::setDevice(1);
d_leftFrames[1].release();
d_rightFrames[1].release();
d_disparities[1].release();
d_algs[1].release();
streams[1].release();
}
void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
{
disparity.create(leftFrame.size(), CV_8UC1);
// Split input data onto two parts for each GPUs.
// We add small border for each part,
// because original algorithm doesn't calculate disparity on image borders.
// With such padding we will get output in the middle of final result.
Mat leftFrameHdr = leftFrame.createMatHeader();
Mat rightFrameHdr = rightFrame.createMatHeader();
Mat disparityHdr = disparity.createMatHeader();
Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
cuda::setDevice(0);
d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
cuda::setDevice(1);
d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
cuda::setDevice(0);
streams[0]->waitForCompletion();
cuda::setDevice(1);
streams[1]->waitForCompletion();
}
///////////////////////////////////////////////////////////
// main
int main(int argc, char** argv)
{
if (argc != 3)
{
cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl;
return -1;
}
const int numDevices = getCudaEnabledDeviceCount();
if (numDevices != 2)
{
cerr << "Two GPUs are required" << endl;
return -1;
}
for (int i = 0; i < numDevices; ++i)
{
DeviceInfo devInfo(i);
if (!devInfo.isCompatible())
{
cerr << "CUDA module was't built for GPU #" << i << " ("
<< devInfo.name() << ", CC " << devInfo.majorVersion()
<< devInfo.minorVersion() << endl;
return -1;
}
printShortCudaDeviceInfo(i);
}
VideoCapture leftVideo(argv[1]);
VideoCapture rightVideo(argv[2]);
if (!leftVideo.isOpened())
{
cerr << "Can't open " << argv[1] << " video file" << endl;
return -1;
}
if (!rightVideo.isOpened())
{
cerr << "Can't open " << argv[2] << " video file" << endl;
return -1;
}
cout << endl;
cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
cout << endl;
Mat leftFrame, rightFrame;
HostMem leftGrayFrame, rightGrayFrame;
StereoSingleGpu gpu0Alg(0);
StereoSingleGpu gpu1Alg(1);
StereoMultiGpuThread multiThreadAlg;
StereoMultiGpuStream multiStreamAlg;
Mat disparityGpu0;
Mat disparityGpu1;
Mat disparityMultiThread;
HostMem disparityMultiStream;
Mat disparityGpu0Show;
Mat disparityGpu1Show;
Mat disparityMultiThreadShow;
Mat disparityMultiStreamShow;
TickMeter tm;
cout << "-------------------------------------------------------------------" << endl;
cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
cout << "-------------------------------------------------------------------" << endl;
for (int i = 0;; ++i)
{
leftVideo >> leftFrame;
rightVideo >> rightFrame;
if (leftFrame.empty() || rightFrame.empty())
break;
if (leftFrame.size() != rightFrame.size())
{
cerr << "Frames have different sizes" << endl;
return -1;
}
leftGrayFrame.create(leftFrame.size(), CV_8UC1);
rightGrayFrame.create(leftFrame.size(), CV_8UC1);
cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
tm.reset(); tm.start();
gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
disparityGpu0);
tm.stop();
const double gpu0Time = tm.getTimeMilli();
tm.reset(); tm.start();
gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
disparityGpu1);
tm.stop();
const double gpu1Time = tm.getTimeMilli();
tm.reset(); tm.start();
multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
disparityMultiThread);
tm.stop();
const double multiThreadTime = tm.getTimeMilli();
tm.reset(); tm.start();
multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
tm.stop();
const double multiStreamTime = tm.getTimeMilli();
cout << "| " << setw(5) << i << " | "
<< setw(8) << setprecision(1) << fixed << gpu0Time << " | "
<< setw(8) << setprecision(1) << fixed << gpu1Time << " | "
<< setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
<< setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
imshow("disparityGpu0", disparityGpu0Show);
imshow("disparityGpu1", disparityGpu1Show);
imshow("disparityMultiThread", disparityMultiThreadShow);
imshow("disparityMultiStream", disparityMultiStreamShow);
const int key = waitKey(30) & 0xff;
if (key == 27)
break;
}
cout << "-------------------------------------------------------------------" << endl;
return 0;
}