前面幾篇都是關(guān)于cuda的基礎(chǔ)知識(shí)介紹,甚是乏味。是時(shí)候用cuda渲染一張圖片來(lái)直觀地感受一下了!
Kevin Beason大神曾經(jīng)寫(xiě)過(guò)一個(gè)99行的c++版本的path tracer。麻雀雖小,五臟俱全,這個(gè)path tracer包含了很多global illumination的特性:
- Monte Carlo samling
- OpenMP
- Specular, Diffuse, and glass BRDFs
- Antialising via super-sampling with importance-sampled tent distribution
- Cosine importance sampling of the hemisphere for diffuse reflection
- Russian roulette for path termination
- Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF
為了方便起見(jiàn),我去掉了不必要的特性,只保留了diffuse材質(zhì)。源代碼請(qǐng)從官網(wǎng)上獲取。修改后的代碼如下:
smallPT.cpp
#include <cmath>
#include <cstdio>
#include <chrono>
#define BOUNCE 4
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
struct Vec
{
double x, y, z;
Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction
struct Sphere
{
double rad;
Vec p, e, c; // position, emission, color
Refl_t refl;
Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
double intersect(const Ray &r) const
{
Vec op = p - r.o;
double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
if (det < 0) return 0; else det = sqrt(det);
return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
}
};
Sphere spheres[] =
{
Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left
Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght
Sphere(1e5, Vec(0.0f, 40.0f, -1e5f), Vec(),Vec(.75,.75,.75),DIFF), // Back
Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt
Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ), Vec(),Vec(.75,.75,.75),DIFF), // Botm
Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top
Sphere(16,Vec(-25.0f, 16.0f, 47.0f), Vec(),Vec(1,1,1), DIFF), // Mirr
Sphere(20,Vec(25.0f, 20.0f, 78.0f), Vec(),Vec(1,1,1), DIFF), // Glas
Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f), Vec(), DIFF) // Lite
};
float rand(unsigned int *seed0, unsigned int *seed1)
{
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
union
{
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; // bitwise AND, bitwise OR
return (res.f - 2.f) / 2.f;
}
inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
inline bool intersect(const Ray &r, double &t, int &id) {
double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
return t < inf;
}
Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
{
double t; // distance to intersection
int id = 0; // id of intersected object
if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black
const Sphere &obj = spheres[id]; // the hit object
Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;
if (obj.refl == DIFF)
{
double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
}
return Vec();
}
int main()
{
Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir
Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];
std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
#pragma omp parallel for schedule(dynamic, 1) private(r) // OpenMP
for (int y = 0; y < HEIGHT; y++) // Loop over image rows
{
fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
for (int x = 0; x < WIDTH; x++) // Loop cols
{
unsigned int s0 = x;
unsigned int s1 = y;
int i = (HEIGHT - y - 1)*WIDTH + x;
r = Vec();
for (int s = 0; s < SPP; s++)
{
Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
}
r = r * (1.0 / SPP);
c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
}
}
std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsedTime = end - begin;
printf("\nTime: %.6lfs\n", elapsedTime.count());
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
for (int i = 0; i < WIDTH*HEIGHT; i++)
fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}
Note
1.這段代碼使用OpenMP進(jìn)行多線程加速。
OpenMP(Open Multi-Processing)是一套支持跨平臺(tái)共享內(nèi)存方式的多線程并發(fā)的編程API
在vs2015中需要手動(dòng)啟用OpenMP功能:

2.生成的ppm圖片文件可以用ps等軟件打開(kāi)
對(duì)應(yīng)的cuda代碼如下:
smallPT.cu
// cuda headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// c++ headers
#include <cstdio>
#include <algorithm>
#include <climits>
#include <chrono>
#include "helper_math.h"
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
#define BOUNCE 4
#define SPHERE_EPSILON 0.0001f
#define BOX_EPSILON 0.001f
#define RAY_EPSILON 0.05f;
struct Ray
{
__device__ Ray(float3 pos, float3 dir) :
pos(pos), dir(dir) {}
float3 pos;
float3 dir;
};
enum class Material { Diffuse, Specular, Refractive };
struct Sphere
{
__device__ float intersect(const Ray &ray) const
{
float t;
float3 dis = pos - ray.pos;
float proj = dot(dis, ray.dir);
float delta = proj * proj - dot(dis, dis) + radius * radius;
if (delta < 0) return 0;
delta = sqrtf(delta);
return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
}
float radius;
float3 pos, emissionColor, mainColor;
Material material;
};
__constant__ Sphere spheres[] =
{
{ 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left
{ 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right
{ 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back
{ 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front
{ 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom
{ 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top
{ 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
{ 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
{ 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse } // Light
};
__device__ inline bool intersectScene(const Ray &ray, float &t, int &id)
{
t = FLT_MAX, id = -1;
int sphereNum = sizeof(spheres) / sizeof(Sphere);
for (int i = 0; i < sphereNum; i++)
{
float ct = spheres[i].intersect(ray);
if (ct != 0 && ct < t)
{
t = ct;
id = i;
}
}
return id != -1;
}
__device__ static float rand(uint *seed0, uint *seed1)
{
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
uint ires = ((*seed0) << 16) + (*seed1);
union
{
float f;
uint ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; // bitwise AND, bitwise OR
return (res.f - 2.f) / 2.f;
}
inline int gammaCorrect(float c)
{
return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
}
__device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2)
{
float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
float3 mask = make_float3(1.0f, 1.0f, 1.0f);
for (int i = 0; i < BOUNCE; i++)
{
float t;
int id;
if (!intersectScene(ray, t, id))
return make_float3(0.0f, 0.0f, 0.0f);
const Sphere &obj = spheres[id];
float3 p = ray.pos + ray.dir * t;
float3 n = normalize(p - obj.pos);
float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;
accumColor += mask * obj.emissionColor;
float r1 = rand(s1, s2) * M_PI * 2;
float r2 = rand(s1, s2);
float r2s = sqrtf(r2);
float3 w = nl;
float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
float3 v = cross(w, u);
float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));
ray.pos = p + nl * RAY_EPSILON;
ray.dir = d;
mask *= obj.mainColor * dot(d, nl) * 2;
}
return accumColor;
}
__global__ void pathTracekernel(float3 *h_output)
{
uint x = blockIdx.x * blockDim.x + threadIdx.x;
uint y = blockIdx.y * blockDim.y + threadIdx.y;
uint i = (HEIGHT - y - 1)*WIDTH + x;
uint s1 = x;
uint s2 = y;
Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
float3 pixel = make_float3(0.0f);
for (int s = 0; s < SPP; s++)
{
float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
Ray ray(camRay.pos + d * 40, normalize(d));
pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
}
h_output[i] = clamp(pixel, 0.0f, 1.0f);
}
int main() {
float3 *h_output = new float3[WIDTH * HEIGHT];
float3 *d_output;
cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));
dim3 block(8, 8, 1);
dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);
printf("CUDA initialized.\nStart rendering...\n");
std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
pathTracekernel << <grid, block >> > (d_output);
cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);
std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsedTime = end - begin;
printf("Time: %.6lfs\n", elapsedTime.count());
cudaFree(d_output);
printf("Done!\n");
FILE *f = fopen("smallptcuda.ppm", "w");
fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
for (int i = 0; i < WIDTH * HEIGHT; i++)
fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
gammaCorrect(h_output[i].y),
gammaCorrect(h_output[i].z));
printf("Saved image to 'smallptcuda.ppm'.\n");
delete[] h_output;
return 0;
}
Note
smallPT.cu中包含了一個(gè)helper_math.h頭文件,該文件實(shí)現(xiàn)了float3等cuda基本數(shù)據(jù)類型的常見(jiàn)操作,例如+ - * dot cross等運(yùn)算。該頭文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目錄下。
由于在這段代碼里,核函數(shù)的單次運(yùn)行時(shí)間會(huì)超出系統(tǒng)的默認(rèn)鎖定時(shí)間,所以會(huì)導(dǎo)致系統(tǒng)認(rèn)為程序出錯(cuò)而結(jié)束進(jìn)程。詳見(jiàn)微軟的官網(wǎng)說(shuō)明。所以我們需要在注冊(cè)表中修改系統(tǒng)對(duì)核函數(shù)的監(jiān)控項(xiàng)。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr項(xiàng):

TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.
TdrDelay可以被理解為核函數(shù)單次運(yùn)行的最長(zhǎng)時(shí)間。
說(shuō)明
目前我不會(huì)詳細(xì)解釋代碼的原理,只是想讓大家直觀的認(rèn)識(shí)一下path tracing。具體的原理說(shuō)明會(huì)放在之后的教程里。
Profile
參數(shù)
圖片尺寸(Width*Height):800*600
反射次數(shù)(Bounce):4
像素采樣數(shù)(Spp, Samples per pixel):1024
smallPT.cpp
渲染時(shí)間
without OpenMP:595.3s

with OpenMP:107.3s

結(jié)果圖片

smallPT.cu
渲染時(shí)間:6.3s

結(jié)果圖片

反射次數(shù)(Bounce):16
像素采樣數(shù)(Spp, Samples per pixel):2048
渲染時(shí)間:26.9s

結(jié)果圖片

由于反射次數(shù)增加,indirect radiance會(huì)增加,場(chǎng)景的亮度也隨之增加;像素采樣率的提高會(huì)使得渲染圖片更加平滑。
分析
可以看出,cpu(smallPT.cpp)和gpu(smallPT.cu)的結(jié)果圖片基本一致(亮度有些差別,可能和float和double的精度有關(guān)),但是時(shí)間差距很大。在gpu上運(yùn)行的速度要比在cpu上快107.3/6.3=17倍,比不使用OpenMP多線程加速更要快上595.3/6.3=94倍。所以說(shuō),把path tracing這種計(jì)算密集型的算法搬到gpu上跑是十分有必要的。
let's step to path tracing now!