丝袜人妻一区,中国视频999

前面幾篇都是關(guān)于cuda的基礎(chǔ)知識(shí)介紹，甚是乏味。是時(shí)候用cuda渲染一張圖片來(lái)直觀地感受一下了！

Kevin Beason大神曾經(jīng)寫(xiě)過(guò)一個(gè)99行的c++版本的path tracer。麻雀雖小，五臟俱全，這個(gè)path tracer包含了很多global illumination的特性：

Monte Carlo samling

OpenMP
Specular, Diffuse, and glass BRDFs
Antialising via super-sampling with importance-sampled tent distribution
Cosine importance sampling of the hemisphere for diffuse reflection
Russian roulette for path termination
Russian roulette and splitting for selecting reflection and/or refraction for glass BRDF

為了方便起見(jiàn)，我去掉了不必要的特性，只保留了diffuse材質(zhì)。源代碼請(qǐng)從官網(wǎng)上獲取。修改后的代碼如下：

smallPT.cpp

#include <cmath>
#include <cstdio>
#include <chrono>

#define BOUNCE 4
#define WIDTH 800
#define HEIGHT 600
#define SPP 1024

struct Vec
{
    double x, y, z;
    Vec(double x_ = 0, double y_ = 0, double z_ = 0) { x = x_; y = y_; z = z_; }
    Vec operator+(const Vec &b) const { return Vec(x + b.x, y + b.y, z + b.z); }
    Vec operator-(const Vec &b) const { return Vec(x - b.x, y - b.y, z - b.z); }
    Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }
    Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }
    Vec& norm() { return *this = *this * (1 / sqrt(x*x + y*y + z*z)); }
    double dot(const Vec &b) const { return x*b.x + y*b.y + z*b.z; }
    Vec operator%(Vec&b) { return Vec(y*b.z - z*b.y, z*b.x - x*b.z, x*b.y - y*b.x); }
};

struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };

enum Refl_t { DIFF, SPEC, REFR }; // diffuse, specular, refraction

struct Sphere
{
    double rad;
    Vec p, e, c; // position, emission, color
    Refl_t refl;
    Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_) :
        rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
    double intersect(const Ray &r) const
    {
        Vec op = p - r.o;
        double t, eps = 1e-4, b = op.dot(r.d), det = b*b - op.dot(op) + rad*rad;
        if (det < 0) return 0; else det = sqrt(det);
        return (t = b - det) > eps ? t : ((t = b + det) > eps ? t : 0);
    }
};

Sphere spheres[] =
{
    Sphere(1e5, Vec(-1e5f - 50.0f, 40.0f, 80.0f), Vec(),Vec(.75,.25,.25),DIFF), // Left 
    Sphere(1e5, Vec(1e5f + 50.0f, 40.0f, 80.0f),Vec(),Vec(.25,.25,.75),DIFF), // Rght 
    Sphere(1e5, Vec(0.0f, 40.0f, -1e5f),     Vec(),Vec(.75,.75,.75),DIFF), // Back 
    Sphere(1e5, Vec(0.0f, 40.0f, 1e5f + 600.0f), Vec(),Vec(1, 1, 1), DIFF), // Frnt 
    Sphere(1e5, Vec(0.0f, -1e5f, 80.0f ),    Vec(),Vec(.75,.75,.75),DIFF), // Botm 
    Sphere(1e5, Vec(0.0f, 1e5f + 80.0f, 80.0f),Vec(),Vec(.75,.75,.75),DIFF), // Top 
    Sphere(16,Vec(-25.0f, 16.0f, 47.0f),       Vec(),Vec(1,1,1), DIFF), // Mirr 
    Sphere(20,Vec(25.0f, 20.0f, 78.0f),       Vec(),Vec(1,1,1), DIFF), // Glas 
    Sphere(600, Vec(0.0f, 678.8f, 80.0f),Vec(1.6f, 1.6f, 1.6f),  Vec(), DIFF) // Lite 
};

float rand(unsigned int *seed0, unsigned int *seed1)
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    unsigned int ires = ((*seed0) << 16) + (*seed1);

    union
    {
        float f;
        unsigned int ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline double clamp(double x) { return x < 0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x) { return int(pow(clamp(x), 1 / 2.2) * 255 + .5); }
inline bool intersect(const Ray &r, double &t, int &id) {
    double n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20;
    for (int i = int(n); i--;) if ((d = spheres[i].intersect(r)) && d < t) { t = d; id = i; }
    return t < inf;
}

Vec radiance(const Ray &r, int depth, unsigned int *s0, unsigned int *s1)
{
    double t;                               // distance to intersection 
    int id = 0;                               // id of intersected object 
    if (depth > BOUNCE || !intersect(r, t, id)) return Vec(); // if miss, return black 
    const Sphere &obj = spheres[id];        // the hit object 
    Vec x = r.o + r.d*t, n = (x - obj.p).norm(), nl = n.dot(r.d) < 0 ? n : n*-1, f = obj.c;

    if (obj.refl == DIFF)
    {
        double r1 = 2 * M_PI*rand(s0, s1), r2 = rand(s0, s1), r2s = sqrt(r2);
        Vec w = nl, u = ((fabs(w.x) > .1 ? Vec(0, 1) : Vec(1)) % w).norm(), v = w%u;
        Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1 - r2)).norm();
        return obj.e + f.mult(radiance(Ray(x, d), depth + 1, s0, s1)) * n.dot(d) * 2;
    }
    return Vec();
}

int main()
{
    Ray cam(Vec(0, 52, 300), Vec(0, -0.05, -1.0).norm()); // cam pos, dir 
    Vec cx = Vec(WIDTH*.5135 / HEIGHT), cy = (cx%cam.d).norm()*.5135, r, *c = new Vec[WIDTH*HEIGHT];

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP 
    for (int y = 0; y < HEIGHT; y++) // Loop over image rows 
    {
        fprintf(stderr, "\rRendering (%d spp) %5.2f%%", SPP, 100.*y / (HEIGHT - 1));
        for (int x = 0; x < WIDTH; x++)   // Loop cols
        {
            unsigned int s0 = x;
            unsigned int s1 = y;
            int i = (HEIGHT - y - 1)*WIDTH + x;
            r = Vec();
            for (int s = 0; s < SPP; s++)
            {
                Vec d = cx*((0.25 + x) / WIDTH - .5) + cy*((0.25 + y) / HEIGHT - .5) + cam.d;
                r = r + radiance(Ray(cam.o + d * 40, d.norm()), 0, &s0, &s1);
            }
            r = r * (1.0 / SPP);
            c[i] = Vec(clamp(r.x), clamp(r.y), clamp(r.z));
        }
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("\nTime: %.6lfs\n", elapsedTime.count());

    FILE *f = fopen("image.ppm", "w");         // Write image to PPM file. 
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH*HEIGHT; i++)
        fprintf(f, "%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

Note

1.這段代碼使用OpenMP進(jìn)行多線程加速。

OpenMP（Open Multi-Processing）是一套支持跨平臺(tái)共享內(nèi)存方式的多線程并發(fā)的編程API

在vs2015中需要手動(dòng)啟用OpenMP功能：

vs2015中打開(kāi)OpenMP選項(xiàng)

2.生成的ppm圖片文件可以用ps等軟件打開(kāi)

對(duì)應(yīng)的cuda代碼如下：

smallPT.cu

// cuda headers
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// c++ headers
#include <cstdio>
#include <algorithm>
#include <climits>
#include <chrono>

#include "helper_math.h"

#define WIDTH 800
#define HEIGHT 600
#define SPP 1024
#define BOUNCE 4
#define SPHERE_EPSILON 0.0001f
#define BOX_EPSILON 0.001f
#define RAY_EPSILON 0.05f;

struct Ray 
{
    __device__ Ray(float3 pos, float3 dir) :
        pos(pos), dir(dir) {}

    float3 pos;
    float3 dir;
};

enum class Material { Diffuse, Specular, Refractive };

struct Sphere 
{
    __device__ float intersect(const Ray &ray) const
    {
        float t;
        float3 dis = pos - ray.pos;
        float proj = dot(dis, ray.dir);
        float delta = proj * proj - dot(dis, dis) + radius * radius;

        if (delta < 0) return 0;

        delta = sqrtf(delta);
        return (t = proj - delta) > SPHERE_EPSILON ? t : ((t = proj + delta) > SPHERE_EPSILON ? t : 0);
    }

    float radius;
    float3 pos, emissionColor, mainColor;
    Material material;
};

__constant__ Sphere spheres[] =
{
    { 1e5f, { -1e5f - 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, Material::Diffuse }, // Left 
    { 1e5f, { 1e5f + 50.0f, 40.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.25f, 0.25f, 0.75f }, Material::Diffuse }, // Right 
    { 1e5f, { 0.0f, 40.0f, -1e5f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Back 
    { 1e5f, { 0.0f, 40.0f, 1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, Material::Diffuse }, // Front 
    { 1e5f, { 0.0f, -1e5f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Bottom 
    { 1e5f, { 0.0f, 1e5f + 80.0f, 80.0f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.75f, 0.75f }, Material::Diffuse }, // Top 
    { 16.0f, { -25.0f, 16.0f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 1
    { 20.0f, { 25.0f, 20.0f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, Material::Diffuse }, // small sphere 2
    { 600.0f, { 0.0f, 678.8f, 80.0f }, { 1.6f, 1.6f, 1.6f }, { 0.0f, 0.0f, 0.0f }, Material::Diffuse }  // Light
};

__device__ inline bool intersectScene(const Ray &ray, float &t, int &id) 
{
    t = FLT_MAX, id = -1;
    int sphereNum = sizeof(spheres) / sizeof(Sphere);
    for (int i = 0; i < sphereNum; i++)
    {
        float ct = spheres[i].intersect(ray);
        if (ct != 0 && ct < t)
        {  
            t = ct;
            id = i;
        }
    }
        
    return id != -1;
}

__device__ static float rand(uint *seed0, uint *seed1) 
{
    *seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16);
    *seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);

    uint ires = ((*seed0) << 16) + (*seed1);

    union 
    {
        float f;
        uint ui;
    } res;

    res.ui = (ires & 0x007fffff) | 0x40000000;  // bitwise AND, bitwise OR

    return (res.f - 2.f) / 2.f;
}

inline int gammaCorrect(float c)
{
    return int(pow(clamp(c, 0.0f, 1.0f), 1 / 2.2) * 255 + .5);
}

__device__ float3 pathTrace(Ray &ray, uint *s1, uint *s2) 
{ 
    float3 accumColor = make_float3(0.0f, 0.0f, 0.0f);
    float3 mask = make_float3(1.0f, 1.0f, 1.0f);

    for (int i = 0; i < BOUNCE; i++)
    {
        float t;
        int id;

        if (!intersectScene(ray, t, id))
            return make_float3(0.0f, 0.0f, 0.0f);

        const Sphere &obj = spheres[id];
        float3 p = ray.pos + ray.dir * t; 
        float3 n = normalize(p - obj.pos);
        float3 nl = dot(n, ray.dir) < 0 ? n : n * -1;

        accumColor += mask * obj.emissionColor;

        float r1 = rand(s1, s2) * M_PI * 2;
        float r2 = rand(s1, s2);
        float r2s = sqrtf(r2);

        float3 w = nl;
        float3 u = normalize(cross((std::fabs(w.x) > std::fabs(w.y) ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
        float3 v = cross(w, u);

        float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));

        ray.pos = p + nl * RAY_EPSILON;
        ray.dir = d;

        mask *= obj.mainColor * dot(d, nl) * 2;
    }

    return accumColor;
}

__global__ void pathTracekernel(float3 *h_output) 
{
    uint x = blockIdx.x * blockDim.x + threadIdx.x;
    uint y = blockIdx.y * blockDim.y + threadIdx.y;

    uint i = (HEIGHT - y - 1)*WIDTH + x;

    uint s1 = x;
    uint s2 = y;

    Ray camRay(make_float3(0.0f, 52.0f, 300.0f), normalize(make_float3(0.0f, -0.05f, -1.0f)));
    float3 cx = make_float3(WIDTH * 0.5135 / HEIGHT, 0.0f, 0.0f);
    float3 cy = normalize(cross(cx, camRay.dir)) * 0.5135;
    float3 pixel = make_float3(0.0f);

    for (int s = 0; s < SPP; s++) 
    {  
        float3 d = camRay.dir + cx*((0.25 + x) / WIDTH - 0.5) + cy*((0.25 + y) / HEIGHT - 0.5);
        Ray ray(camRay.pos + d * 40, normalize(d));

        pixel += pathTrace(ray, &s1, &s2)*(1. / SPP);
    }

    h_output[i] = clamp(pixel, 0.0f, 1.0f);
}

int main() {

    float3 *h_output = new float3[WIDTH * HEIGHT];
    float3 *d_output;

    cudaMalloc(&d_output, WIDTH * HEIGHT * sizeof(float3));

    dim3 block(8, 8, 1);
    dim3 grid(WIDTH / block.x, HEIGHT / block.y, 1);

    printf("CUDA initialized.\nStart rendering...\n");

    std::chrono::time_point<std::chrono::system_clock> begin = std::chrono::system_clock::now();

    pathTracekernel << <grid, block >> > (d_output);

    cudaMemcpy(h_output, d_output, WIDTH * HEIGHT * sizeof(float3), cudaMemcpyDeviceToHost);

    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();
    std::chrono::duration<double> elapsedTime = end - begin;
    printf("Time: %.6lfs\n", elapsedTime.count());

    cudaFree(d_output);

    printf("Done!\n");

    FILE *f = fopen("smallptcuda.ppm", "w");
    fprintf(f, "P3\n%d %d\n%d\n", WIDTH, HEIGHT, 255);
    for (int i = 0; i < WIDTH * HEIGHT; i++)
        fprintf(f, "%d %d %d ", gammaCorrect(h_output[i].x),
            gammaCorrect(h_output[i].y),
            gammaCorrect(h_output[i].z));

    printf("Saved image to 'smallptcuda.ppm'.\n");

    delete[] h_output;

    return 0;
}

Note

smallPT.cu中包含了一個(gè)helper_math.h頭文件，該文件實(shí)現(xiàn)了float3等cuda基本數(shù)據(jù)類型的常見(jiàn)操作，例如+ - * dot cross等運(yùn)算。該頭文件位于C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\common\inc目錄下。
由于在這段代碼里，核函數(shù)的單次運(yùn)行時(shí)間會(huì)超出系統(tǒng)的默認(rèn)鎖定時(shí)間，所以會(huì)導(dǎo)致系統(tǒng)認(rèn)為程序出錯(cuò)而結(jié)束進(jìn)程。詳見(jiàn)微軟的官網(wǎng)說(shuō)明。所以我們需要在注冊(cè)表中修改系統(tǒng)對(duì)核函數(shù)的監(jiān)控項(xiàng)。在HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers下添加如下Tdr項(xiàng)：

TdrDelay

TdrDelay Specifies the number of seconds that the GPU can delay the preempt request from the GPU scheduler. This is effectively the timeout threshold. The default value is 2 seconds.

TdrDelay可以被理解為核函數(shù)單次運(yùn)行的最長(zhǎng)時(shí)間。

說(shuō)明

目前我不會(huì)詳細(xì)解釋代碼的原理，只是想讓大家直觀的認(rèn)識(shí)一下path tracing。具體的原理說(shuō)明會(huì)放在之后的教程里。

Profile

參數(shù)

圖片尺寸（Width*Height）：800*600
反射次數(shù)（Bounce）：4
像素采樣數(shù)（Spp, Samples per pixel）：1024

smallPT.cpp

渲染時(shí)間
without OpenMP：595.3s

不用OpenMP加速

with OpenMP：107.3s

OpenMP加速

結(jié)果圖片

cpu渲染圖

smallPT.cu

渲染時(shí)間：6.3s

結(jié)果圖片

gpu渲染圖

反射次數(shù)（Bounce）：16
像素采樣數(shù)（Spp, Samples per pixel）：2048

渲染時(shí)間：26.9s

結(jié)果圖片

spp:2048, bounce:8

由于反射次數(shù)增加，indirect radiance會(huì)增加，場(chǎng)景的亮度也隨之增加；像素采樣率的提高會(huì)使得渲染圖片更加平滑。

分析

可以看出，cpu（smallPT.cpp）和gpu（smallPT.cu）的結(jié)果圖片基本一致（亮度有些差別，可能和float和double的精度有關(guān)），但是時(shí)間差距很大。在gpu上運(yùn)行的速度要比在cpu上快107.3/6.3=17倍，比不使用OpenMP多線程加速更要快上595.3/6.3=94倍。所以說(shuō)，把path tracing這種計(jì)算密集型的算法搬到gpu上跑是十分有必要的。

let's step to path tracing now!

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

教你一步步寫(xiě)一個(gè)cuda path tracer：牛刀小試！

教你一步步寫(xiě)一個(gè)cuda path tracer：牛刀小試！

Note