Add aras-p's ToyPathTracer.

https://github.com/aras-p/ToyPathTracer b076563906169aa2f9e6d7218ef85decf81f8f72
2024-11-29 08:54:34 +00:00 · 2019-10-29 22:21:34 +01:00 · 2019-10-29 22:21:34 +01:00 · 0b1eff8b0d
commit 0b1eff8b0d
parent 789b95f259
22 changed files with 3957 additions and 0 deletions
--- a/examples/ToyPathTracer/README
+++ b/examples/ToyPathTracer/README
@ -0,0 +1 @@
 https://github.com/aras-p/ToyPathTracer
--- a/examples/ToyPathTracer/Source/Config.h
+++ b/examples/ToyPathTracer/Source/Config.h
@ -0,0 +1,33 @@
 #if defined(__APPLE__) && !defined(__METAL_VERSION__)
 #include <TargetConditionals.h>
 #endif
 #define kBackbufferWidth 1280
 #define kBackbufferHeight 720
 #if defined(__EMSCRIPTEN__)
 #define CPU_CAN_DO_SIMD 0
 #define CPU_CAN_DO_THREADS 0
 #else
 #define CPU_CAN_DO_SIMD 1
 #define CPU_CAN_DO_THREADS 1
 #endif
 #define DO_SAMPLES_PER_PIXEL 4
 #define DO_ANIMATE_SMOOTHING 0.9f
 #define DO_LIGHT_SAMPLING 1
 #define DO_MITSUBA_COMPARE 0
 // Should path tracing be done on the GPU with a compute shader?
 #define DO_COMPUTE_GPU 0
 #define kCSGroupSizeX 8
 #define kCSGroupSizeY 8
 #define kCSMaxObjects 64
 // Should float3 struct use SSE/NEON?
 #define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && CPU_CAN_DO_SIMD && 1)
 // Should HitSpheres function use SSE/NEON?
 #define DO_HIT_SPHERES_SIMD (CPU_CAN_DO_SIMD && 1)
--- a/examples/ToyPathTracer/Source/MathSimd.h
+++ b/examples/ToyPathTracer/Source/MathSimd.h
@ -0,0 +1,192 @@
 #pragma once
 #if defined(_MSC_VER)
 #define VM_INLINE __forceinline
 #else
 #define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline
 #endif
 #define kSimdWidth 4
 #if !defined(__arm__) && !defined(__arm64__) && !defined(__EMSCRIPTEN__)
 // ---- SSE implementation
 #include <xmmintrin.h>
 #include <emmintrin.h>
 #include <smmintrin.h>
 #define SHUFFLE4(V, X,Y,Z,W) float4(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(W,Z,Y,X)))
 struct float4
 {
    VM_INLINE float4() {}
    VM_INLINE explicit float4(const float *p) { m = _mm_loadu_ps(p); }
    VM_INLINE explicit float4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
    VM_INLINE explicit float4(float v) { m = _mm_set_ps1(v); }
    VM_INLINE explicit float4(__m128 v) { m = v; }
    VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
    VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
    VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
    VM_INLINE float getW() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3))); }
    __m128 m;
 };
 typedef float4 bool4;
 VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
 VM_INLINE float4 operator- (float4 a, float4 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
 VM_INLINE float4 operator* (float4 a, float4 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = _mm_and_ps(a.m, b.m); return a; }
 VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = _mm_or_ps(a.m, b.m); return a; }
 VM_INLINE float4 operator- (float4 a) { a.m = _mm_xor_ps(a.m, _mm_set1_ps(-0.0f)); return a; }
 VM_INLINE float4 min(float4 a, float4 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
 VM_INLINE float4 max(float4 a, float4 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
 VM_INLINE float hmin(float4 v)
 {
    v = min(v, SHUFFLE4(v, 2, 3, 0, 0));
    v = min(v, SHUFFLE4(v, 1, 0, 0, 0));
    return v.getX();
 }
 // Returns a 4-bit code where bit0..bit3 is X..W
 VM_INLINE unsigned mask(float4 v) { return _mm_movemask_ps(v.m); }
 // Once we have a comparison, we can branch based on its results:
 VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
 VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
 // "select", i.e. hibit(cond) ? b : a
 // on SSE4.1 and up this can be done easily via "blend" instruction;
 // on older SSEs has to do a bunch of hoops, see
 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
 VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
 {
 #if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
    a.m = _mm_blendv_ps(a.m, b.m, cond.m);
 #else
    __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
    a.m = _mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m));
 #endif
    return a;
 }
 VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
 {
 #if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
    return _mm_blendv_epi8(a, b, _mm_castps_si128(cond.m));
 #else
    __m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31);
    return _mm_or_si128(_mm_and_si128(d, b), _mm_andnot_si128(d, a));
 #endif
 }
 VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); }
 #elif !defined(__EMSCRIPTEN__)
 // ---- NEON implementation
 #define USE_NEON 1
 #include <arm_neon.h>
 struct float4
 {
    VM_INLINE float4() {}
    VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); }
    VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); }
    VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); }
    VM_INLINE explicit float4(float32x4_t v) { m = v; }
    VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
    VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
    VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
    VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); }
    float32x4_t m;
 };
 typedef float4 bool4;
 VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
 VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
 VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vceqq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
 VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vcltq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vcleq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
 VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vandq_u32(a.m, b.m); return a; }
 VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vorrq_u32(a.m, b.m); return a; }
 VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
 VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
 VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
 VM_INLINE float hmin(float4 v)
 {
    float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
    return vget_lane_f32(minOfMinOfHalfs, 0);
 }
 // Returns a 4-bit code where bit0..bit3 is X..W
 VM_INLINE unsigned mask(float4 v)
 {
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
    uint32x4_t t1 = vtstq_u32(t0, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
 }
 // Once we have a comparison, we can branch based on its results:
 VM_INLINE bool any(bool4 v) { return mask(v) != 0; }
 VM_INLINE bool all(bool4 v) { return mask(v) == 15; }
 // "select", i.e. hibit(cond) ? b : a
 // on SSE4.1 and up this can be done easily via "blend" instruction;
 // on older SSEs has to do a bunch of hoops, see
 // https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
 VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
 {
    a.m = vbslq_f32(cond.m, b.m, a.m);
    return a;
 }
 VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
 {
    return vbslq_f32(cond.m, b, a);
 }
 VM_INLINE float4 sqrtf(float4 v)
 {
    float32x4_t V = v.m;
    float32x4_t S0 = vrsqrteq_f32(V);
    float32x4_t P0 = vmulq_f32( V, S0 );
    float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
    float32x4_t S1 = vmulq_f32( S0, R0 );
    float32x4_t P1 = vmulq_f32( V, S1 );
    float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
    float32x4_t S2 = vmulq_f32( S1, R1 );
    float32x4_t P2 = vmulq_f32( V, S2 );
    float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
    float32x4_t S3 = vmulq_f32( S2, R2 );
    return float4(vmulq_f32(V, S3));
 }
 VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); }
 VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); }
 VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); }
 VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); }
 #endif
--- a/examples/ToyPathTracer/Source/Maths.cpp
+++ b/examples/ToyPathTracer/Source/Maths.cpp
@ -0,0 +1,203 @@
 #include "Maths.h"
 #include <stdlib.h>
 #include <stdint.h>
 static uint32_t XorShift32(uint32_t& state)
 {
    uint32_t x = state;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 15;
    state = x;
    return x;
 }
 float RandomFloat01(uint32_t& state)
 {
    return (XorShift32(state) & 0xFFFFFF) / 16777216.0f;
 }
 float3 RandomInUnitDisk(uint32_t& state)
 {
    float3 p;
    do
    {
        p = 2.0 * float3(RandomFloat01(state),RandomFloat01(state),0) - float3(1,1,0);
    } while (dot(p,p) >= 1.0);
    return p;
 }
 float3 RandomInUnitSphere(uint32_t& state)
 {
    float3 p;
    do {
        p = 2.0*float3(RandomFloat01(state),RandomFloat01(state),RandomFloat01(state)) - float3(1,1,1);
    } while (sqLength(p) >= 1.0);
    return p;
 }
 float3 RandomUnitVector(uint32_t& state)
 {
    float z = RandomFloat01(state) * 2.0f - 1.0f;
    float a = RandomFloat01(state) * 2.0f * kPI;
    float r = sqrtf(1.0f - z * z);
    float x = r * cosf(a);
    float y = r * sinf(a);
    return float3(x, y, z);
 }
 int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
 {
 #if DO_HIT_SPHERES_SIMD
    float4 hitT = float4(tMax);
 #if USE_NEON
    int32x4_t id = vdupq_n_s32(-1);
 #else
    __m128i id = _mm_set1_epi32(-1);
 #endif
 #if DO_FLOAT3_WITH_SIMD && !USE_NEON
    float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
    float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
    float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
    float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
    float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
    float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
 #elif DO_FLOAT3_WITH_SIMD
    float4 rOrigX = splatX(r.orig.m);
    float4 rOrigY = splatY(r.orig.m);
    float4 rOrigZ = splatZ(r.orig.m);
    float4 rDirX = splatX(r.dir.m);
    float4 rDirY = splatY(r.dir.m);
    float4 rDirZ = splatZ(r.dir.m);
 #else
    float4 rOrigX = float4(r.orig.x);
    float4 rOrigY = float4(r.orig.y);
    float4 rOrigZ = float4(r.orig.z);
    float4 rDirX = float4(r.dir.x);
    float4 rDirY = float4(r.dir.y);
    float4 rDirZ = float4(r.dir.z);
 #endif
    float4 tMin4 = float4(tMin);
 #if USE_NEON
    int32x4_t curId = vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32)));
 #else
    __m128i curId = _mm_set_epi32(3, 2, 1, 0);
 #endif
    // process 4 spheres at once
    for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
    {
        // load data for 4 spheres
        float4 sCenterX = float4(spheres.centerX + i);
        float4 sCenterY = float4(spheres.centerY + i);
        float4 sCenterZ = float4(spheres.centerZ + i);
        float4 sSqRadius = float4(spheres.sqRadius + i);
        // note: we flip this vector and calculate -b (nb) since that happens to be slightly preferable computationally
        float4 coX = sCenterX - rOrigX;
        float4 coY = sCenterY - rOrigY;
        float4 coZ = sCenterZ - rOrigZ;
        float4 nb = coX * rDirX + coY * rDirY + coZ * rDirZ;
        float4 c = coX * coX + coY * coY + coZ * coZ - sSqRadius;
        float4 discr = nb * nb - c;
        bool4 discrPos = discr > float4(0.0f);
        // if ray hits any of the 4 spheres
        if (any(discrPos))
        {
            float4 discrSq = sqrtf(discr);
            // ray could hit spheres at t0 & t1
            float4 t0 = nb - discrSq;
            float4 t1 = nb + discrSq;
            float4 t = select(t1, t0, t0 > tMin4); // if t0 is above min, take it (since it's the earlier hit); else try t1.
            bool4 msk = discrPos & (t > tMin4) & (t < hitT);
            // if hit, take it
            id = select(id, curId, msk);
            hitT = select(hitT, t, msk);
        }
 #if USE_NEON
        curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
 #else
        curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
 #endif
    }
    // now we have up to 4 hits, find and return closest one
    float minT = hmin(hitT);
    if (minT < tMax) // any actual hits?
    {
        int minMask = mask(hitT == float4(minT));
        if (minMask != 0)
        {
            int id_scalar[4];
            float hitT_scalar[4];
 #if USE_NEON
            vst1q_s32(id_scalar, id);
            vst1q_f32(hitT_scalar, hitT.m);
 #else
            _mm_storeu_si128((__m128i *)id_scalar, id);
            _mm_storeu_ps(hitT_scalar, hitT.m);
 #endif
            // In general, you would do this with a bit scan (first set/trailing zero count).
            // But who cares, it's only 16 options.
            static const int laneId[16] =
            {
                0, 0, 1, 0, // 00xx
                2, 0, 1, 0, // 01xx
                3, 0, 1, 0, // 10xx
                2, 0, 1, 0, // 11xx
            };
            int lane = laneId[minMask];
            int hitId = id_scalar[lane];
            float finalHitT = hitT_scalar[lane];
            outHit.pos = r.pointAt(finalHitT);
            outHit.normal = (outHit.pos - float3(spheres.centerX[hitId], spheres.centerY[hitId], spheres.centerZ[hitId])) * spheres.invRadius[hitId];
            outHit.t = finalHitT;
            return hitId;
        }
    }
    return -1;
 #else // #if DO_HIT_SPHERES_SIMD
    float hitT = tMax;
    int id = -1;
    for (int i = 0; i < spheres.count; ++i)
    {
        float coX = spheres.centerX[i] - r.orig.getX();
        float coY = spheres.centerY[i] - r.orig.getY();
        float coZ = spheres.centerZ[i] - r.orig.getZ();
        float nb = coX * r.dir.getX() + coY * r.dir.getY() + coZ * r.dir.getZ();
        float c = coX * coX + coY * coY + coZ * coZ - spheres.sqRadius[i];
        float discr = nb * nb - c;
        if (discr > 0)
        {
            float discrSq = sqrtf(discr);
            // Try earlier t
            float t = nb - discrSq;
            if (t <= tMin) // before min, try later t!
                t = nb + discrSq;
            if (t > tMin && t < hitT)
            {
                id = i;
                hitT = t;
            }
        }
    }
    if (id != -1)
    {
        outHit.pos = r.pointAt(hitT);
        outHit.normal = (outHit.pos - float3(spheres.centerX[id], spheres.centerY[id], spheres.centerZ[id])) * spheres.invRadius[id];
        outHit.t = hitT;
        return id;
    }
    else
        return -1;
 #endif // #else of #if DO_HIT_SPHERES_SIMD
 }
--- a/examples/ToyPathTracer/Source/Maths.h
+++ b/examples/ToyPathTracer/Source/Maths.h
@ -0,0 +1,436 @@
 #pragma once
 #include <math.h>
 #include <assert.h>
 #include <stdint.h>
 #include "Config.h"
 #include "MathSimd.h"
 #define kPI 3.1415926f
 // SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
 #if DO_FLOAT3_WITH_SIMD
 #if !defined(__arm__) && !defined(__arm64__)
 // ---- SSE implementation
 // SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
 // SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
 #define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))
 struct float3
 {
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
    VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
    VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }
    VM_INLINE explicit float3(__m128 v) { m = v; }
    VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
    VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
    VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }
    VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
    VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
    void setX(float x)
    {
        m = _mm_move_ss(m, _mm_set_ss(x));
    }
    void setY(float y)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(y));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
        m = _mm_move_ss(t, m);
    }
    void setZ(float z)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(z));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
        m = _mm_move_ss(t, m);
    }
    __m128 m;
 };
 typedef float3 bool3;
 VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
 VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
 VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
 VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
 VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
 VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
 VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
 VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
 VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
 VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
 VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
 VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
 VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
 VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
 VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
 VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
 VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
 VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
 VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
 VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
 VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
 VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }
 VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }
 VM_INLINE float hmin(float3 v)
 {
    v = min(v, SHUFFLE3(v, 1, 0, 2));
    return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
 }
 VM_INLINE float hmax(float3 v)
 {
    v = max(v, SHUFFLE3(v, 1, 0, 2));
    return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
 }
 VM_INLINE float3 cross(float3 a, float3 b)
 {
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
 }
 // Returns a 3-bit code where bit0..bit2 is X..Z
 VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
 // Once we have a comparison, we can branch based on its results:
 VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
 VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
 VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
 VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
 VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
 #else // #if !defined(__arm__) && !defined(__arm64__)
 // ---- NEON implementation
 #include <arm_neon.h>
 struct float3
 {
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }
    VM_INLINE explicit float3(float32x4_t v) { m = v; }
    VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
    VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
    VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
    VM_INLINE float3 yzx() const
    {
        float32x2_t low = vget_low_f32(m);
        float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);
        return float3(yzx);
    }
    VM_INLINE float3 zxy() const
    {
        float32x4_t p = m;
        p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];
        return float3(p);
    }
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
    void setX(float x)
    {
        m = vsetq_lane_f32(x, m, 0);
    }
    void setY(float y)
    {
        m = vsetq_lane_f32(y, m, 1);
    }
    void setZ(float z)
    {
        m = vsetq_lane_f32(z, m, 2);
    }
    float32x4_t m;
 };
 typedef float3 bool3;
 VM_INLINE float32x4_t rcp_2(float32x4_t v)
 {
    float32x4_t e = vrecpeq_f32(v);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    return e;
 }
 VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
 VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
 VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
 VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
 VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
 VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
 VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
 VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
 VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
 VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
 VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
 VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
 VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
 VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
 VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }
 VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
 VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }
 VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
 VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }
 VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
 VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
 VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
 VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }
 VM_INLINE float hmin(float3 v)
 {
    float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
    return vget_lane_f32(minOfMinOfHalfs, 0);
 }
 VM_INLINE float hmax(float3 v)
 {
    float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
    return vget_lane_f32(maxOfMaxOfHalfs, 0);
 }
 VM_INLINE float3 cross(float3 a, float3 b)
 {
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
 }
 // Returns a 3-bit code where bit0..bit2 is X..Z
 VM_INLINE unsigned mask(float3 v)
 {
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
    uint32x4_t t1 = vtstq_u32(t0, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
 }
 // Once we have a comparison, we can branch based on its results:
 VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
 VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
 VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
 VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
 VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }
 #endif // #else of #if !defined(__arm__) && !defined(__arm64__)
 #else // #if DO_FLOAT3_WITH_SIMD
 // ---- Simple scalar C implementation
 struct float3
 {
    float3() : x(0), y(0), z(0) {}
    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
    float3 operator-() const { return float3(-x, -y, -z); }
    float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
    float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
    float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
    float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }
    VM_INLINE float getX() const { return x; }
    VM_INLINE float getY() const { return y; }
    VM_INLINE float getZ() const { return z; }
    VM_INLINE void setX(float x_) { x = x_; }
    VM_INLINE void setY(float y_) { y = y_; }
    VM_INLINE void setZ(float z_) { z = z_; }
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
    float x, y, z;
 };
 VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
 VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
 VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
 VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
 VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
 VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
 VM_INLINE float3 cross(const float3& a, const float3& b)
 {
    return float3(
                  a.y*b.z - a.z*b.y,
                  -(a.x*b.z - a.z*b.x),
                  a.x*b.y - a.y*b.x
                  );
 }
 #endif // #else of #if DO_FLOAT3_WITH_SIMD
 VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
 VM_INLINE float sqLength(float3 v) { return dot(v, v); }
 VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
 VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }
 inline void AssertUnit(float3 v)
 {
    assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
 }
 inline float3 reflect(float3 v, float3 n)
 {
    return v - 2*dot(v,n)*n;
 }
 inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
 {
    AssertUnit(v);
    float dt = dot(v, n);
    float discr = 1.0f - nint*nint*(1-dt*dt);
    if (discr > 0)
    {
        outRefracted = nint * (v - n*dt) - n*sqrtf(discr);
        return true;
    }
    return false;
 }
 inline float schlick(float cosine, float ri)
 {
    float r0 = (1-ri) / (1+ri);
    r0 = r0*r0;
    return r0 + (1-r0)*powf(1-cosine, 5);
 }
 struct Ray
 {
    Ray() {}
    Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }
    float3 pointAt(float t) const { return orig + dir * t; }
    float3 orig;
    float3 dir;
 };
 struct Hit
 {
    float3 pos;
    float3 normal;
    float t;
 };
 struct Sphere
 {
    Sphere() : radius(1.0f), invRadius(0.0f) {}
    Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}
    void UpdateDerivedData() { invRadius = 1.0f/radius; }
    float3 center;
    float radius;
    float invRadius;
 };
 // data for all spheres in a "structure of arrays" layout
 struct SpheresSoA
 {
    SpheresSoA(int c)
    {
        count = c;
        // we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
        // enough space
        simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
        centerX = new float[simdCount];
        centerY = new float[simdCount];
        centerZ = new float[simdCount];
        sqRadius = new float[simdCount];
        invRadius = new float[simdCount];
        // set all data to "impossible sphere" state
        for (int i = count; i < simdCount; ++i)
        {
            centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
            sqRadius[i] = 0.0f;
            invRadius[i] = 0.0f;
        }
    }
    ~SpheresSoA()
    {
        delete[] centerX;
        delete[] centerY;
        delete[] centerZ;
        delete[] sqRadius;
        delete[] invRadius;
    }
    float* centerX;
    float* centerY;
    float* centerZ;
    float* sqRadius;
    float* invRadius;
    int simdCount;
    int count;
 };
 int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);
 float RandomFloat01(uint32_t& state);
 float3 RandomInUnitDisk(uint32_t& state);
 float3 RandomInUnitSphere(uint32_t& state);
 float3 RandomUnitVector(uint32_t& state);
 struct Camera
 {
    Camera() {}
    // vfov is top to bottom in degrees
    Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
    {
        lensRadius = aperture / 2;
        float theta = vfov*kPI/180;
        float halfHeight = tanf(theta/2);
        float halfWidth = aspect * halfHeight;
        origin = lookFrom;
        w = normalize(lookFrom - lookAt);
        u = normalize(cross(vup, w));
        v = cross(w, u);
        lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
        horizontal = 2*halfWidth*focusDist*u;
        vertical = 2*halfHeight*focusDist*v;
    }
    Ray GetRay(float s, float t, uint32_t& state) const
    {
        float3 rd = lensRadius * RandomInUnitDisk(state);
        float3 offset = u * rd.getX() + v * rd.getY();
        return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
    }
    float3 origin;
    float3 lowerLeftCorner;
    float3 horizontal;
    float3 vertical;
    float3 u, v, w;
    float lensRadius;
 };
--- a/examples/ToyPathTracer/Source/Test.cpp
+++ b/examples/ToyPathTracer/Source/Test.cpp
@ -0,0 +1,380 @@
 #include "Config.h"
 #include "Test.h"
 #include "Maths.h"
 #include <algorithm>
 #if CPU_CAN_DO_THREADS
 #include "enkiTS/TaskScheduler_c.h"
 #endif
 #include <atomic>
 // 46 spheres (2 emissive) when enabled; 9 spheres (1 emissive) when disabled
 #define DO_BIG_SCENE 1
 static Sphere s_Spheres[] =
 {
    {float3(0,-100.5,-1), 100},
    {float3(2,0,-1), 0.5f},
    {float3(0,0,-1), 0.5f},
    {float3(-2,0,-1), 0.5f},
    {float3(2,0,1), 0.5f},
    {float3(0,0,1), 0.5f},
    {float3(-2,0,1), 0.5f},
    {float3(0.5f,1,0.5f), 0.5f},
    {float3(-1.5f,1.5f,0.f), 0.3f},
 #if DO_BIG_SCENE
    {float3(4,0,-3), 0.5f}, {float3(3,0,-3), 0.5f}, {float3(2,0,-3), 0.5f}, {float3(1,0,-3), 0.5f}, {float3(0,0,-3), 0.5f}, {float3(-1,0,-3), 0.5f}, {float3(-2,0,-3), 0.5f}, {float3(-3,0,-3), 0.5f}, {float3(-4,0,-3), 0.5f},
    {float3(4,0,-4), 0.5f}, {float3(3,0,-4), 0.5f}, {float3(2,0,-4), 0.5f}, {float3(1,0,-4), 0.5f}, {float3(0,0,-4), 0.5f}, {float3(-1,0,-4), 0.5f}, {float3(-2,0,-4), 0.5f}, {float3(-3,0,-4), 0.5f}, {float3(-4,0,-4), 0.5f},
    {float3(4,0,-5), 0.5f}, {float3(3,0,-5), 0.5f}, {float3(2,0,-5), 0.5f}, {float3(1,0,-5), 0.5f}, {float3(0,0,-5), 0.5f}, {float3(-1,0,-5), 0.5f}, {float3(-2,0,-5), 0.5f}, {float3(-3,0,-5), 0.5f}, {float3(-4,0,-5), 0.5f},
    {float3(4,0,-6), 0.5f}, {float3(3,0,-6), 0.5f}, {float3(2,0,-6), 0.5f}, {float3(1,0,-6), 0.5f}, {float3(0,0,-6), 0.5f}, {float3(-1,0,-6), 0.5f}, {float3(-2,0,-6), 0.5f}, {float3(-3,0,-6), 0.5f}, {float3(-4,0,-6), 0.5f},
    {float3(1.5f,1.5f,-2), 0.3f},
 #endif // #if DO_BIG_SCENE
 };
 const int kSphereCount = sizeof(s_Spheres) / sizeof(s_Spheres[0]);
 static SpheresSoA s_SpheresSoA(kSphereCount);
 struct Material
 {
    enum Type { Lambert, Metal, Dielectric };
    Type type;
    float3 albedo;
    float3 emissive;
    float roughness;
    float ri;
 };
 static Material s_SphereMats[kSphereCount] =
 {
    { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, },
    { Material::Lambert, float3(0.8f, 0.4f, 0.4f), float3(0,0,0), 0, 0, },
    { Material::Lambert, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0, },
    { Material::Metal, float3(0.4f, 0.4f, 0.8f), float3(0,0,0), 0, 0 },
    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0 },
    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.2f, 0 },
    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.6f, 0 },
    { Material::Dielectric, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 1.5f },
    { Material::Lambert, float3(0.8f, 0.6f, 0.2f), float3(30,25,15), 0, 0 },
 #if DO_BIG_SCENE
    { Material::Lambert, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
    { Material::Metal, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
    { Material::Metal, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
    { Material::Lambert, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
    { Material::Lambert, float3(0.1f, 0.2f, 0.5f), float3(3,10,20), 0, 0 },
 #endif
 };
 static int s_EmissiveSpheres[kSphereCount];
 static int s_EmissiveSphereCount;
 static Camera s_Cam;
 const float kMinT = 0.001f;
 const float kMaxT = 1.0e7f;
 const int kMaxDepth = 10;
 bool HitWorld(const Ray& r, float tMin, float tMax, Hit& outHit, int& outID)
 {
    outID = HitSpheres(r, s_SpheresSoA, tMin, tMax, outHit);
    return outID != -1;
 }
 static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3& attenuation, Ray& scattered, float3& outLightE, int& inoutRayCount, uint32_t& state)
 {
    outLightE = float3(0,0,0);
    if (mat.type == Material::Lambert)
    {
        // random point on unit sphere that is tangent to the hit point
        float3 target = rec.pos + rec.normal + RandomUnitVector(state);
        scattered = Ray(rec.pos, normalize(target - rec.pos));
        attenuation = mat.albedo;
        // sample lights
 #if DO_LIGHT_SAMPLING
        for (int j = 0; j < s_EmissiveSphereCount; ++j)
        {
            int i = s_EmissiveSpheres[j];
            const Material& smat = s_SphereMats[i];
            if (&mat == &smat)
                continue; // skip self
            const Sphere& s = s_Spheres[i];
            // create a random direction towards sphere
            // coord system for sampling: sw, su, sv
            float3 sw = normalize(s.center - rec.pos);
            float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw));
            float3 sv = cross(sw, su);
            // sample sphere by solid angle
            float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center));
            float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
            float cosA = 1.0f - eps1 + eps1 * cosAMax;
            float sinA = sqrtf(1.0f - cosA*cosA);
            float phi = 2 * kPI * eps2;
            float3 l = su * (cosf(phi) * sinA) + sv * (sinf(phi) * sinA) + sw * cosA;
            //l = normalize(l); // NOTE(fg): This is already normalized, by construction.
            // shoot shadow ray
            Hit lightHit;
            int hitID;
            ++inoutRayCount;
            if (HitWorld(Ray(rec.pos, l), kMinT, kMaxT, lightHit, hitID) && hitID == i)
            {
                float omega = 2 * kPI * (1-cosAMax);
                float3 rdir = r_in.dir;
                AssertUnit(rdir);
                float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
                outLightE += (mat.albedo * smat.emissive) * (std::max(0.0f, dot(l, nl)) * omega / kPI);
            }
        }
 #endif
        return true;
    }
    else if (mat.type == Material::Metal)
    {
        AssertUnit(r_in.dir); AssertUnit(rec.normal);
        float3 refl = reflect(r_in.dir, rec.normal);
        // reflected ray, and random inside of sphere based on roughness
        float roughness = mat.roughness;
 #if DO_MITSUBA_COMPARE
        roughness = 0; // until we get better BRDF for metals
 #endif
        scattered = Ray(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
        attenuation = mat.albedo;
        return dot(scattered.dir, rec.normal) > 0;
    }
    else if (mat.type == Material::Dielectric)
    {
        AssertUnit(r_in.dir); AssertUnit(rec.normal);
        float3 outwardN;
        float3 rdir = r_in.dir;
        float3 refl = reflect(rdir, rec.normal);
        float nint;
        attenuation = float3(1,1,1);
        float3 refr;
        float reflProb;
        float cosine;
        if (dot(rdir, rec.normal) > 0)
        {
            outwardN = -rec.normal;
            nint = mat.ri;
            cosine = mat.ri * dot(rdir, rec.normal);
        }
        else
        {
            outwardN = rec.normal;
            nint = 1.0f / mat.ri;
            cosine = -dot(rdir, rec.normal);
        }
        if (refract(rdir, outwardN, nint, refr))
        {
            reflProb = schlick(cosine, mat.ri);
        }
        else
        {
            reflProb = 1;
        }
        if (RandomFloat01(state) < reflProb)
            scattered = Ray(rec.pos, normalize(refl));
        else
            scattered = Ray(rec.pos, normalize(refr));
    }
    else
    {
        attenuation = float3(1,0,1);
        return false;
    }
    return true;
 }
 static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state, bool doMaterialE = true)
 {
    Hit rec;
    int id = 0;
    ++inoutRayCount;
    if (HitWorld(r, kMinT, kMaxT, rec, id))
    {
        Ray scattered;
        float3 attenuation;
        float3 lightE;
        const Material& mat = s_SphereMats[id];
        float3 matE = mat.emissive;
        if (depth < kMaxDepth && Scatter(mat, r, rec, attenuation, scattered, lightE, inoutRayCount, state))
        {
 #if DO_LIGHT_SAMPLING
            if (!doMaterialE) matE = float3(0,0,0); // don't add material emission if told so
            // dor Lambert materials, we just did explicit light (emissive) sampling and already
            // for their contribution, so if next ray bounce hits the light again, don't add
            // emission
            doMaterialE = (mat.type != Material::Lambert);
 #endif
            return matE + lightE + attenuation * Trace(scattered, depth+1, inoutRayCount, state, doMaterialE);
        }
        else
        {
            return matE;
        }
    }
    else
    {
        // sky
 #if DO_MITSUBA_COMPARE
        return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light
 #else
        float3 unitDir = r.dir;
        float t = 0.5f*(unitDir.getY() + 1.0f);
        return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f;
 #endif
    }
 }
 #if CPU_CAN_DO_THREADS
 static enkiTaskScheduler* g_TS;
 #endif
 void InitializeTest()
 {
    #if CPU_CAN_DO_THREADS
    g_TS = enkiNewTaskScheduler();
    enkiInitTaskScheduler(g_TS);
    #endif
 }
 void ShutdownTest()
 {
    #if CPU_CAN_DO_THREADS
    enkiDeleteTaskScheduler(g_TS);
    #endif
 }
 struct JobData
 {
    float time;
    int frameCount;
    int screenWidth, screenHeight;
    float* backbuffer;
    Camera* cam;
    std::atomic<int> rayCount;
    unsigned testFlags;
 };
 static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void* data_)
 {
    JobData& data = *(JobData*)data_;
    float* backbuffer = data.backbuffer + start * data.screenWidth * 4;
    float invWidth = 1.0f / data.screenWidth;
    float invHeight = 1.0f / data.screenHeight;
    float lerpFac = float(data.frameCount) / float(data.frameCount+1);
    if (data.testFlags & kFlagAnimate)
        lerpFac *= DO_ANIMATE_SMOOTHING;
    if (!(data.testFlags & kFlagProgressive))
        lerpFac = 0;
    int rayCount = 0;
    for (uint32_t y = start; y < end; ++y)
    {
        uint32_t state = (y * 9781 + data.frameCount * 6271) | 1;
        for (int x = 0; x < data.screenWidth; ++x)
        {
            float3 col(0, 0, 0);
            for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++)
            {
                float u = float(x + RandomFloat01(state)) * invWidth;
                float v = float(y + RandomFloat01(state)) * invHeight;
                Ray r = data.cam->GetRay(u, v, state);
                col += Trace(r, 0, rayCount, state);
            }
            col *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
            float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]);
            col = prev * lerpFac + col * (1-lerpFac);
            col.store(backbuffer);
            backbuffer += 4;
        }
    }
    data.rayCount += rayCount;
 }
 void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags)
 {
    if (testFlags & kFlagAnimate)
    {
        s_Spheres[1].center.setY(cosf(time) + 1.0f);
        s_Spheres[8].center.setZ(sinf(time)*0.3f);
    }
    float3 lookfrom(0, 2, 3);
    float3 lookat(0, 0, 0);
    float distToFocus = 3;
 #if DO_MITSUBA_COMPARE
    float aperture = 0.0f;
 #else
    float aperture = 0.1f;
 #endif
 #if DO_BIG_SCENE
    aperture *= 0.2f;
 #endif
    s_EmissiveSphereCount = 0;
    for (int i = 0; i < kSphereCount; ++i)
    {
        Sphere& s = s_Spheres[i];
        s.UpdateDerivedData();
        s_SpheresSoA.centerX[i] = s.center.getX();
        s_SpheresSoA.centerY[i] = s.center.getY();
        s_SpheresSoA.centerZ[i] = s.center.getZ();
        s_SpheresSoA.sqRadius[i] = s.radius * s.radius;
        s_SpheresSoA.invRadius[i] = s.invRadius;
        // Remember IDs of emissive spheres (light sources)
        const Material& smat = s_SphereMats[i];
        if (smat.emissive.getX() > 0 || smat.emissive.getY() > 0 || smat.emissive.getZ() > 0)
        {
            s_EmissiveSpheres[s_EmissiveSphereCount] = i;
            s_EmissiveSphereCount++;
        }
    }
    s_Cam = Camera(lookfrom, lookat, float3(0, 1, 0), 60, float(screenWidth) / float(screenHeight), aperture, distToFocus);
 }
 void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags)
 {
    JobData args;
    args.time = time;
    args.frameCount = frameCount;
    args.screenWidth = screenWidth;
    args.screenHeight = screenHeight;
    args.backbuffer = backbuffer;
    args.cam = &s_Cam;
    args.testFlags = testFlags;
    args.rayCount = 0;
    #if CPU_CAN_DO_THREADS
    enkiTaskSet* task = enkiCreateTaskSet(g_TS, TraceRowJob);
    bool threaded = true;
    enkiAddTaskSetToPipeMinRange(g_TS, task, &args, screenHeight, threaded ? 4 : screenHeight);
    enkiWaitForTaskSet(g_TS, task);
    enkiDeleteTaskSet(task);
    #else
    TraceRowJob(0, screenHeight, 0, &args);
    #endif
    outRayCount = args.rayCount;
 }
 void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize)
 {
    outCount = kSphereCount;
    outObjectSize = sizeof(Sphere);
    outMaterialSize = sizeof(Material);
    outCamSize = sizeof(Camera);
 }
 void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount)
 {
    memcpy(outObjects, s_Spheres, kSphereCount * sizeof(s_Spheres[0]));
    memcpy(outMaterials, s_SphereMats, kSphereCount * sizeof(s_SphereMats[0]));
    memcpy(outCam, &s_Cam, sizeof(s_Cam));
    memcpy(outEmissives, s_EmissiveSpheres, s_EmissiveSphereCount * sizeof(s_EmissiveSpheres[0]));
    *outEmissiveCount = s_EmissiveSphereCount;
 }
--- a/examples/ToyPathTracer/Source/Test.h
+++ b/examples/ToyPathTracer/Source/Test.h
@ -0,0 +1,17 @@
 #pragma once
 #include <stdint.h>
 enum TestFlags
 {
    kFlagAnimate = (1 << 0),
    kFlagProgressive = (1 << 1),
 };
 void InitializeTest();
 void ShutdownTest();
 void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags);
 void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags);
 void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize);
 void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount);
--- a/examples/ToyPathTracer/Source/enkiTS/Atomics.h
+++ b/examples/ToyPathTracer/Source/enkiTS/Atomics.h
@ -0,0 +1,79 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #pragma once
 #include <stdint.h>
 #ifdef _WIN32
    #define WIN32_LEAN_AND_MEAN
    #include <Windows.h>
 	#undef GetObject
    #include <intrin.h>
    extern "C" void _ReadWriteBarrier();
    #pragma intrinsic(_ReadWriteBarrier)
    #pragma intrinsic(_InterlockedCompareExchange)
    #pragma intrinsic(_InterlockedExchangeAdd)
    // Memory Barriers to prevent CPU and Compiler re-ordering
    #define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier()
    #define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier()
    #define BASE_ALIGN(x) __declspec( align( x ) ) 
 #else
    #define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory")  
    #define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory")  
 	#define BASE_ALIGN(x)  __attribute__ ((aligned( x )))
 #endif
 namespace enki
 {
    // Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; }
    // returns old *pDest (so if successfull, returns compareWith)
    inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith )
    {
       #ifdef _WIN32
 			// assumes two's complement - unsigned / signed conversion leads to same bit pattern
            return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith );
        #else
            return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
        #endif      
    }
    inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith )
    {
       #ifdef _WIN32
 			// assumes two's complement - unsigned / signed conversion leads to same bit pattern
            return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith );
        #else
            return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
        #endif      
    }	
    // Atomically performs: tmp = *pDest; *pDest += value; return tmp;
    inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value )
    {
       #ifdef _WIN32
            return _InterlockedExchangeAdd( (long*)pDest, value );
        #else
            return __sync_fetch_and_add( pDest, value );
        #endif      
    }
 }
--- a/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h
+++ b/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h
@ -0,0 +1,240 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #pragma once
 #include <stdint.h>
 #include <assert.h>
 #include "Atomics.h"
 #include <string.h>
 namespace enki
 {
    // LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming
    // Readers can only read from the back of the pipe
    // The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader)
    // for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx
    // Note: using log2 sizes so we do not need to clamp (multi-operation)
    // T is the contained type
    // Note this is not true lockless as the use of flags as a form of lock state.
    template<uint8_t cSizeLog2, typename T> class LockLessMultiReadPipe
    {
    public:
        LockLessMultiReadPipe();
        ~LockLessMultiReadPipe() {}
        // ReaderTryReadBack returns false if we were unable to read
        // This is thread safe for both multiple readers and the writer
        bool ReaderTryReadBack(   T* pOut );
        // WriterTryReadFront returns false if we were unable to read
        // This is thread safe for the single writer, but should not be called by readers
        bool WriterTryReadFront(  T* pOut );
        // WriterTryWriteFront returns false if we were unable to write
        // This is thread safe for the single writer, but should not be called by readers
        bool WriterTryWriteFront( const T& in );
        // IsPipeEmpty() is a utility function, not intended for general use
        // Should only be used very prudently.
        bool IsPipeEmpty() const
        {
            return 0 == m_WriteIndex - m_ReadCount;
        }
        void Clear()
        {
            m_WriteIndex = 0;
            m_ReadIndex = 0;
            m_ReadCount = 0;
            memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
        }
    private:
        const static uint32_t           ms_cSize        = ( 1 << cSizeLog2 );
        const static uint32_t           ms_cIndexMask   = ms_cSize - 1;
        const static uint32_t           FLAG_INVALID    = 0xFFFFFFFF; // 32bit for CAS
        const static uint32_t           FLAG_CAN_WRITE  = 0x00000000; // 32bit for CAS
        const static uint32_t           FLAG_CAN_READ   = 0x11111111; // 32bit for CAS
        T                               m_Buffer[ ms_cSize ];
        // read and write indexes allow fast access to the pipe, but actual access
        // controlled by the access flags. 
        volatile uint32_t BASE_ALIGN(4) m_WriteIndex;
        volatile uint32_t BASE_ALIGN(4) m_ReadCount;
        volatile uint32_t               m_Flags[  ms_cSize ];
        volatile uint32_t BASE_ALIGN(4) m_ReadIndex;
    };
    template<uint8_t cSizeLog2, typename T> inline
        LockLessMultiReadPipe<cSizeLog2,T>::LockLessMultiReadPipe()
        : m_WriteIndex(0)
        , m_ReadIndex(0)
        , m_ReadCount(0)
    {
        assert( cSizeLog2 < 32 );
        memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
    }
    template<uint8_t cSizeLog2, typename T> inline
        bool LockLessMultiReadPipe<cSizeLog2,T>::ReaderTryReadBack(   T* pOut )
    {
        uint32_t actualReadIndex;
        uint32_t readCount  = m_ReadCount;
        // We get hold of read index for consistency,
        // and do first pass starting at read count
        uint32_t readIndexToUse  = readCount;
        while(true)
        {
            uint32_t writeIndex = m_WriteIndex;
            // power of two sizes ensures we can use a simple calc without modulus
            uint32_t numInPipe = writeIndex - readCount;
            if( 0 == numInPipe )
            {
                return false;
            }
            if( readIndexToUse >= writeIndex )
            {
                // move back to start
                readIndexToUse = m_ReadIndex;
            }
            // power of two sizes ensures we can perform AND for a modulus
            actualReadIndex    = readIndexToUse & ms_cIndexMask;
            // Multiple potential readers mean we should check if the data is valid,
            // using an atomic compare exchange
            uint32_t previous = AtomicCompareAndSwap( &m_Flags[  actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
            if( FLAG_CAN_READ == previous )
            {
                break;
            }
            ++readIndexToUse;
            //update known readcount
            readCount  = m_ReadCount;
        }
        // we update the read index using an atomic add, as we've only read one piece of data.
        // this ensure consistency of the read index, and the above loop ensures readers
        // only read from unread data
        AtomicAdd(  (volatile int32_t*)&m_ReadCount, 1 );
        BASE_MEMORYBARRIER_ACQUIRE();
        // now read data, ensuring we do so after above reads & CAS
        *pOut = m_Buffer[ actualReadIndex ];
        m_Flags[  actualReadIndex ] = FLAG_CAN_WRITE;
        return true;
    }
    template<uint8_t cSizeLog2, typename T> inline
        bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryReadFront(  T* pOut )
    {
        uint32_t writeIndex = m_WriteIndex;
        uint32_t frontReadIndex  = writeIndex;
        // Multiple potential readers mean we should check if the data is valid,
        // using an atomic compare exchange - which acts as a form of lock (so not quite lockless really).
        uint32_t previous = FLAG_INVALID;
        uint32_t actualReadIndex = 0;
        while( true )
        {
            // power of two sizes ensures we can use a simple calc without modulus
            uint32_t readCount = m_ReadCount;
            uint32_t numInPipe = writeIndex - readCount;
            if( 0 == numInPipe || 0 == frontReadIndex )
            {
                // frontReadIndex can get to 0 here if that item was just being read by another thread.
                m_ReadIndex = readCount;
                return false;
            }
            --frontReadIndex;
            actualReadIndex = frontReadIndex & ms_cIndexMask;
            previous = AtomicCompareAndSwap( &m_Flags[  actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
            if( FLAG_CAN_READ == previous )
            {
                break;
            }
            else if( m_ReadIndex >= frontReadIndex  )
            {
                return false;
            }
        }
        // now read data, ensuring we do so after above reads & CAS
        *pOut = m_Buffer[ actualReadIndex ];
        m_Flags[  actualReadIndex ] = FLAG_CAN_WRITE;
        BASE_MEMORYBARRIER_RELEASE();
        // 32-bit aligned stores are atomic, and writer owns the write index
        // we only move one back as this is as many as we have read, not where we have read from.
        --m_WriteIndex;
        return true;
    }
    template<uint8_t cSizeLog2, typename T> inline
        bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryWriteFront( const T& in )
    {
        // The writer 'owns' the write index, and readers can only reduce
        // the amount of data in the pipe.
        // We get hold of both values for consistency and to reduce false sharing
        // impacting more than one access
        uint32_t writeIndex = m_WriteIndex;
        // power of two sizes ensures we can perform AND for a modulus
        uint32_t actualWriteIndex    = writeIndex & ms_cIndexMask;
        // a reader may still be reading this item, as there are multiple readers
        if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE ) 
        {
            return false; // still being read, so have caught up with tail. 
        }
        // as we are the only writer we can update the data without atomics
        //  whilst the write index has not been updated
        m_Buffer[ actualWriteIndex ] = in;
        m_Flags[  actualWriteIndex ] = FLAG_CAN_READ;
        // We need to ensure the above writes occur prior to updating the write index,
        // otherwise another thread might read before it's finished
        BASE_MEMORYBARRIER_RELEASE();
        // 32-bit aligned stores are atomic, and the writer controls the write index
        ++writeIndex;
        m_WriteIndex = writeIndex;
        return true;
    }
 }
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp
@ -0,0 +1,437 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #include <assert.h>
 #include "TaskScheduler.h"
 #include "LockLessMultiReadPipe.h"
 using namespace enki;
 static const uint32_t PIPESIZE_LOG2              = 8;
 static const uint32_t SPIN_COUNT                 = 100;
 static const uint32_t SPIN_BACKOFF_MULTIPLIER    = 10;
 static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;
 // each software thread gets it's own copy of gtl_threadNum, so this is safe to use as a static variable
 static THREAD_LOCAL uint32_t                             gtl_threadNum       = 0;
 namespace enki 
 {
 	struct SubTaskSet
 	{
 		ITaskSet*           pTask;
 		TaskSetPartition    partition;
 	};
 	// we derive class TaskPipe rather than typedef to get forward declaration working easily
 	class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};
 	struct ThreadArgs
 	{
 		uint32_t		threadNum;
 		TaskScheduler*  pTaskScheduler;
 	};
 }
 namespace
 {
 	SubTaskSet       SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )
 	{
 		SubTaskSet splitTask = subTask_;
 		uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;
        if( rangeToSplit_ > rangeLeft )
        {
            rangeToSplit_ = rangeLeft;
        }
        splitTask.partition.end = subTask_.partition.start + rangeToSplit_;
 		subTask_.partition.start = splitTask.partition.end;
 		return splitTask;
 	}
 	#if defined _WIN32
 		#if defined _M_IX86  || defined _M_X64
 			#pragma intrinsic(_mm_pause)
 			inline void Pause() { _mm_pause(); }
 		#endif
 	#elif defined __i386__ || defined __x86_64__
 		inline void Pause() { __asm__ __volatile__("pause;"); }
 	#else
 		inline void Pause() { ;} // may have NOP or yield equiv
 	#endif
 }
 static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)
 {
 	if( func_ )
 	{
 		func_(threadnum_);
 	}
 }
 ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()
 {
 	return &m_ProfilerCallbacks;
 }
 THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )
 {
 	ThreadArgs args					= *(ThreadArgs*)pArgs;
 	uint32_t threadNum				= args.threadNum;
 	TaskScheduler*  pTS				= args.pTaskScheduler;
    gtl_threadNum      = threadNum;
 	SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );
    uint32_t spinCount = 0;
 	uint32_t hintPipeToCheck_io = threadNum + 1;	// does not need to be clamped.
    while( pTS->m_bRunning )
    {
        if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )
        {
            // no tasks, will spin then wait
            ++spinCount;
            if( spinCount > SPIN_COUNT )
            {
 				pTS->WaitForTasks( threadNum );
 				spinCount = 0;
            }
 			else
 			{
 				uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;
 				while( spinBackoffCount )
 				{
 					Pause();
 					--spinBackoffCount;
 				}
 			}
        }
        else
        {
            spinCount = 0;
        }
    }
    AtomicAdd( &pTS->m_NumThreadsRunning, -1 );
 	SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );
    return 0;
 }
 void TaskScheduler::StartThreads()
 {
    if( m_bHaveThreads )
    {
        return;
    }
    m_bRunning = true;
    SemaphoreCreate( m_NewTaskSemaphore );
    // we create one less thread than m_NumThreads as the main thread counts as one
    m_pThreadNumStore = new ThreadArgs[m_NumThreads];
    m_pThreadIDs      = new threadid_t[m_NumThreads];
 	m_pThreadNumStore[0].threadNum      = 0;
 	m_pThreadNumStore[0].pTaskScheduler = this;
 	m_pThreadIDs[0] = 0;
    m_NumThreadsWaiting = 0;
    m_NumThreadsRunning = 1;// acount for main thread
    for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
    {
 		m_pThreadNumStore[thread].threadNum      = thread;
 		m_pThreadNumStore[thread].pTaskScheduler = this;
        ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );
        ++m_NumThreadsRunning;
    }
    // ensure we have sufficient tasks to equally fill either all threads including main
    // or just the threads we've launched, this is outside the firstinit as we want to be able
    // to runtime change it
 	if( 1 == m_NumThreads )
 	{
 		m_NumPartitions = 1;
 		m_NumInitialPartitions = 1;
 	}
 	else
 	{
 		m_NumPartitions = m_NumThreads * (m_NumThreads - 1);
 		m_NumInitialPartitions = m_NumThreads - 1;
 		if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )
 		{
 			m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;
 		}
 	}
    m_bHaveThreads = true;
 }
 void TaskScheduler::StopThreads( bool bWait_ )
 {
    if( m_bHaveThreads )
    {
        // wait for them threads quit before deleting data
        m_bRunning = false;
        while( bWait_ && m_NumThreadsRunning > 1 )
        {
            // keep firing event to ensure all threads pick up state of m_bRunning
            SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );
        }
        for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
        {
            ThreadTerminate( m_pThreadIDs[thread] );
        }
 		m_NumThreads = 0;
        delete[] m_pThreadNumStore;
        delete[] m_pThreadIDs;
        m_pThreadNumStore = 0;
        m_pThreadIDs = 0;
        SemaphoreClose( m_NewTaskSemaphore );
        m_bHaveThreads = false;
 		m_NumThreadsWaiting = 0;
 		m_NumThreadsRunning = 0;
    }
 }
 bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )
 {
    // check for tasks
    SubTaskSet subTask;
    bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );
 	uint32_t threadToCheck = hintPipeToCheck_io_;
 	uint32_t checkCount = 0;
    while( !bHaveTask && checkCount < m_NumThreads )
    {
 		threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;
 		if( threadToCheck != threadNum )
 		{
 			bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );
 		}
 		++checkCount;
    }
    if( bHaveTask )
    {
 		// update hint, will preserve value unless actually got task from another thread.
 		hintPipeToCheck_io_ = threadToCheck;
 		uint32_t partitionSize = subTask.partition.end - subTask.partition.start;
 		if( subTask.pTask->m_RangeToRun < partitionSize )
 		{
 			SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );
 			SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );
 			taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );
 			AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );
 		}
 		else
 		{
 			// the task has already been divided up by AddTaskSetToPipe, so just run it
 			subTask.pTask->ExecuteRange( subTask.partition, threadNum );
 			AtomicAdd( &subTask.pTask->m_RunningCount, -1 );
 		}
    }
    return bHaveTask;
 }
 void TaskScheduler::WaitForTasks( uint32_t threadNum )
 {
 	// We incrememt the number of threads waiting here in order
 	// to ensure that the check for tasks occurs after the increment
 	// to prevent a task being added after a check, then the thread waiting.
 	// This will occasionally result in threads being mistakenly awoken,
 	// but they will then go back to sleep.
 	AtomicAdd( &m_NumThreadsWaiting, 1 );
    bool bHaveTasks = false;
    for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
    {
        if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
        {
            bHaveTasks = true;
            break;
        }
    }
    if( !bHaveTasks )
    {
        SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );
        SemaphoreWait( m_NewTaskSemaphore );
        SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );
    }
    int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );
    assert( prev != 0 );
 }
 void TaskScheduler::WakeThreads()
 {
 	SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting );
 }
 void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
 	uint32_t rangeToSplit_, int32_t runningCountOffset_ )
 {
    int32_t numAdded = 0;
    while( subTask_.partition.start != subTask_.partition.end )
    {
        SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );
        // add the partition to the pipe
        ++numAdded;
        if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )
        {
 			if( numAdded > 1 )
 			{
 				WakeThreads();
 			}
 			// alter range to run the appropriate fraction
 			if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ )
 			{
 				taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;
 				subTask_.partition.start = taskToAdd.partition.end;
 			}
            taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );
            --numAdded;
        }
    }
    // increment running count by number added
    AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );
 	WakeThreads();
 }
 void    TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )
 {
 	// set running count to -1 to guarantee it won't be found complete until all subtasks added
    pTaskSet->m_RunningCount = -1;
    // divide task up and add to pipe
    pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions;
    if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; }
 	uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions;
 	if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; }
    SubTaskSet subTask;
    subTask.pTask = pTaskSet;
    subTask.partition.start = 0;
    subTask.partition.end = pTaskSet->m_SetSize;
 	SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 );
 }
 void    TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )
 {
 	uint32_t hintPipeToCheck_io = gtl_threadNum + 1;	// does not need to be clamped.
 	if( pTaskSet )
 	{
 		while( pTaskSet->m_RunningCount )
 		{
 			TryRunTask( gtl_threadNum, hintPipeToCheck_io );
 			// should add a spin then wait for task completion event.
 		}
 	}
 	else
 	{
 			TryRunTask( gtl_threadNum, hintPipeToCheck_io );
 	}
 }
 void    TaskScheduler::WaitforAll()
 {
    bool bHaveTasks = true;
 	uint32_t hintPipeToCheck_io = gtl_threadNum  + 1;	// does not need to be clamped.
 	int32_t threadsRunning = m_NumThreadsRunning - 1;
    while( bHaveTasks || m_NumThreadsWaiting < threadsRunning )
    {
        TryRunTask( gtl_threadNum, hintPipeToCheck_io );
        bHaveTasks = false;
        for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
        {
            if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
            {
                bHaveTasks = true;
                break;
            }
        }
     }
 }
 void    TaskScheduler::WaitforAllAndShutdown()
 {
    WaitforAll();
    StopThreads(true);
 	delete[] m_pPipesPerThread;
    m_pPipesPerThread = 0;
 }
 uint32_t        TaskScheduler::GetNumTaskThreads() const
 {
    return m_NumThreads;
 }
 TaskScheduler::TaskScheduler()
 		: m_pPipesPerThread(NULL)
 		, m_NumThreads(0)
 		, m_pThreadNumStore(NULL)
 		, m_pThreadIDs(NULL)
 		, m_bRunning(false)
 		, m_NumThreadsRunning(0)
 		, m_NumThreadsWaiting(0)
 		, m_NumPartitions(0)
 		, m_bHaveThreads(false)
 {
 	memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));
 }
 TaskScheduler::~TaskScheduler()
 {
    StopThreads( true ); // Stops threads, waiting for them.
    delete[] m_pPipesPerThread;
    m_pPipesPerThread = 0;
 }
 void    TaskScheduler::Initialize( uint32_t numThreads_ )
 {
 	assert( numThreads_ );
    StopThreads( true ); // Stops threads, waiting for them.
    delete[] m_pPipesPerThread;
 	m_NumThreads = numThreads_;
    m_pPipesPerThread = new TaskPipe[ m_NumThreads ];
    StartThreads();
 }
 void   TaskScheduler::Initialize()
 {
 	Initialize( GetNumHardwareThreads() );
 }
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h
@ -0,0 +1,177 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #pragma once
 #include <stdint.h>
 #include "Threads.h"
 namespace enki
 {
 	struct TaskSetPartition
 	{
 		uint32_t start;
 		uint32_t end;
 	};
 	class  TaskScheduler;
 	class  TaskPipe;
 	struct ThreadArgs;
 	struct SubTaskSet;
 	// Subclass ITaskSet to create tasks.
 	// TaskSets can be re-used, but check
 	class ITaskSet
 	{
 	public:
        ITaskSet()
            : m_SetSize(1)
 			, m_MinRange(1)
            , m_RunningCount(0)
 			, m_RangeToRun(1)
        {}
        ITaskSet( uint32_t setSize_ )
            : m_SetSize( setSize_ )
 			, m_MinRange(1)
            , m_RunningCount(0)
 			, m_RangeToRun(1)
        {}
 		ITaskSet( uint32_t setSize_, uint32_t minRange_ )
            : m_SetSize( setSize_ )
 			, m_MinRange( minRange_ )
            , m_RunningCount(0)
 			, m_RangeToRun(minRange_)
        {}
 		// Execute range should be overloaded to process tasks. It will be called with a
 		// range_ where range.start >= 0; range.start < range.end; and range.end < m_SetSize;
 		// The range values should be mapped so that linearly processing them in order is cache friendly
 		// i.e. neighbouring values should be close together.
 		// threadnum should not be used for changing processing of data, it's intended purpose
 		// is to allow per-thread data buckets for output.
 		virtual void            ExecuteRange( TaskSetPartition range, uint32_t threadnum  ) = 0;
 		// Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1
 		uint32_t                m_SetSize;
 		// Minimum size of of TaskSetPartition range when splitting a task set into partitions.
 		// This should be set to a value which results in computation effort of at least 10k
 		// clock cycles to minimize tast scheduler overhead.
 		// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
 		// of m_MinRange.
 		// Also known as grain size in literature.
 		uint32_t                m_MinRange;
 		bool                    GetIsComplete()
 		{
 			return 0 == m_RunningCount;
 		}
 	private:
 		friend class           TaskScheduler;
 		volatile int32_t        m_RunningCount;
 		uint32_t                m_RangeToRun;
 	};
 	// TaskScheduler implements several callbacks intended for profilers
 	typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ );
 	struct ProfilerCallbacks
 	{
 		ProfilerCallbackFunc threadStart;
 		ProfilerCallbackFunc threadStop;
 		ProfilerCallbackFunc waitStart;
 		ProfilerCallbackFunc waitStop;
 	};
 	class TaskScheduler
 	{
 	public:
 		TaskScheduler();
 		~TaskScheduler();
 		// Call either Initialize() or Initialize( numThreads_ ) before adding tasks.
 		// Initialize() will create GetNumHardwareThreads()-1 threads, which is
 		// sufficient to fill the system when including the main thread.
 		// Initialize can be called multiple times - it will wait for completion
 		// before re-initializing.
 		void			Initialize();
 		// Initialize( numThreads_ ) - numThreads_ (must be > 0)
 		// will create numThreads_-1 threads, as thread 0 is
 		// the thread on which the initialize was called.
 		void			Initialize( uint32_t numThreads_ );
 		// Adds the TaskSet to pipe and returns if the pipe is not full.
 		// If the pipe is full, pTaskSet is run.
 		// should only be called from main thread, or within a task
 		void            AddTaskSetToPipe( ITaskSet* pTaskSet );
 		// Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete();
 		// should only be called from thread which created the taskscheduler , or within a task
 		// if called with 0 it will try to run tasks, and return if none available.
 		void            WaitforTaskSet( const ITaskSet* pTaskSet );
 		// Waits for all task sets to complete - not guaranteed to work unless we know we
 		// are in a situation where tasks aren't being continuosly added.
 		void            WaitforAll();
 		// Waits for all task sets to complete and shutdown threads - not guaranteed to work unless we know we
 		// are in a situation where tasks aren't being continuosly added.
 		void            WaitforAllAndShutdown();
 		// Returns the number of threads created for running tasks + 1
 		// to account for the main thread.
 		uint32_t        GetNumTaskThreads() const;
 		// Returns the ProfilerCallbacks structure so that it can be modified to
 		// set the callbacks.
 		ProfilerCallbacks* GetProfilerCallbacks();
 	private:
 		static THREADFUNC_DECL  TaskingThreadFunction( void* pArgs );
        void             WaitForTasks( uint32_t threadNum );
 		bool             TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ );
 		void             StartThreads();
 		void             StopThreads( bool bWait_ );
 		void             SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
 										  uint32_t rangeToSplit_, int32_t runningCountOffset_ );
 		void             WakeThreads();
 		TaskPipe*                                                m_pPipesPerThread;
 		uint32_t                                                 m_NumThreads;
 		ThreadArgs*                                              m_pThreadNumStore;
 		threadid_t*                                              m_pThreadIDs;
 		volatile bool                                            m_bRunning;
 		volatile int32_t                                         m_NumThreadsRunning;
 		volatile int32_t                                         m_NumThreadsWaiting;
 		uint32_t                                                 m_NumPartitions;
 		uint32_t                                                 m_NumInitialPartitions;
 		semaphoreid_t                                            m_NewTaskSemaphore;
 		bool                                                     m_bHaveThreads;
 		ProfilerCallbacks										 m_ProfilerCallbacks;
 		TaskScheduler( const TaskScheduler& nocopy );
 		TaskScheduler& operator=( const TaskScheduler& nocopy );
 	};
 }
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp
@ -0,0 +1,122 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #include "TaskScheduler_c.h"
 #include "TaskScheduler.h"
 #include <assert.h>
 using namespace enki;
 struct enkiTaskScheduler : TaskScheduler
 {
 };
 struct enkiTaskSet : ITaskSet
 {
 	enkiTaskSet( enkiTaskExecuteRange taskFun_ ) : taskFun(taskFun_), pArgs(NULL) {}
 	virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum  )
 	{
 		taskFun( range.start, range.end, threadnum, pArgs );
 	}
 	enkiTaskExecuteRange taskFun;
 	void* pArgs;
 };
 enkiTaskScheduler*	enkiNewTaskScheduler()
 {
 	enkiTaskScheduler* pETS = new enkiTaskScheduler();
    return pETS;
 }
 void	            enkiInitTaskScheduler(  enkiTaskScheduler* pETS_ )
 {
    pETS_->Initialize();
 }
 void	            enkiInitTaskSchedulerNumThreads(  enkiTaskScheduler* pETS_, uint32_t numThreads_ )
 {
    pETS_->Initialize( numThreads_ );
 }
 void				enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ )
 {
 	delete pETS_;
 }
 enkiTaskSet*		enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_  )
 {
 	return new enkiTaskSet( taskFunc_ );
 }
 void                enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ )
 {
 	delete pTaskSet_;
 }
 void				enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, void* pArgs_, uint32_t setSize_ )
 {
 	assert( pTaskSet_ );
 	assert( pTaskSet_->taskFun );
 	pTaskSet_->m_SetSize = setSize_;
 	pTaskSet_->pArgs = pArgs_;
 	pETS_->AddTaskSetToPipe( pTaskSet_ );
 }
 void enkiAddTaskSetToPipeMinRange(enkiTaskScheduler * pETS_, enkiTaskSet * pTaskSet_, void * pArgs_, uint32_t setSize_, uint32_t minRange_)
 {
 	assert( pTaskSet_ );
 	assert( pTaskSet_->taskFun );
 	pTaskSet_->m_SetSize = setSize_;
 	pTaskSet_->m_MinRange = minRange_;
 	pTaskSet_->pArgs = pArgs_;
 	pETS_->AddTaskSetToPipe( pTaskSet_ );
 }
 int				enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
 {
 	assert( pTaskSet_ );
 	return ( pTaskSet_->GetIsComplete() ) ? 1 : 0;
 }
 void				enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
 {
 	pETS_->WaitforTaskSet( pTaskSet_ );
 }
 void				enkiWaitForAll( enkiTaskScheduler* pETS_ )
 {
 	pETS_->WaitforAll();
 }
 uint32_t			enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ )
 {
 	return pETS_->GetNumTaskThreads();
 }
 enkiProfilerCallbacks*	enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ )
 {
    assert( sizeof(enkiProfilerCallbacks) == sizeof(enki::ProfilerCallbacks) );
    return (enkiProfilerCallbacks*)pETS_->GetProfilerCallbacks();
 }
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h
@ -0,0 +1,104 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #pragma once
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include <stdint.h>
 typedef struct enkiTaskScheduler enkiTaskScheduler;
 typedef struct enkiTaskSet		 enkiTaskSet;
 typedef void (* enkiTaskExecuteRange)( uint32_t start_, uint32_t end, uint32_t threadnum_, void* pArgs_ );
 // Create a new task scheduler
 enkiTaskScheduler*	enkiNewTaskScheduler();
 // Initialize task scheduler - will create GetNumHardwareThreads()-1 threads, which is
 // sufficient to fill the system when including the main thread.
 // Initialize can be called multiple times - it will wait for completion
 // before re-initializing.
 void	            enkiInitTaskScheduler(  enkiTaskScheduler* pETS_ );
 // Initialize a task scheduler with numThreads_ (must be > 0)
 // will create numThreads_-1 threads, as thread 0 is
 // the thread on which the initialize was called.
 void	            enkiInitTaskSchedulerNumThreads(  enkiTaskScheduler* pETS_, uint32_t numThreads_ );
 // Delete a task scheduler
 void				enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ );
 // Create a task set.
 enkiTaskSet*		enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_  );
 // Delete a task set.
 void                enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ );
 // Schedule the task
 void				enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
 										   void* pArgs_, uint32_t setSize_ );
 // Schedule the task with a minimum range.
 // This should be set to a value which results in computation effort of at least 10k
 // clock cycles to minimize tast scheduler overhead.
 // NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
 // of m_MinRange.
 // Also known as grain size in literature.
 void				enkiAddTaskSetToPipeMinRange( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
 												  void* pArgs_, uint32_t setSize_, uint32_t minRange_ );
 // Check if TaskSet is complete. Doesn't wait. Returns 1 if complete, 0 if not.
 int					enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
 // Wait for a given task.
 // should only be called from thread which created the taskscheduler , or within a task
 // if called with 0 it will try to run tasks, and return if none available.
 void				enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
 // Waits for all task sets to complete - not guaranteed to work unless we know we
 // are in a situation where tasks aren't being continuosly added.
 void				enkiWaitForAll( enkiTaskScheduler* pETS_ );
 // get number of threads
 uint32_t			enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ );
 // TaskScheduler implements several callbacks intended for profilers
 typedef void (*enkiProfilerCallbackFunc)( uint32_t threadnum_ );
 struct enkiProfilerCallbacks
 {
    enkiProfilerCallbackFunc threadStart;
    enkiProfilerCallbackFunc threadStop;
    enkiProfilerCallbackFunc waitStart;
    enkiProfilerCallbackFunc waitStop;
 };
 // Get the callback structure so it can be set 
 struct enkiProfilerCallbacks*	enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ );
 #ifdef __cplusplus
 }
 #endif
--- a/examples/ToyPathTracer/Source/enkiTS/Threads.h
+++ b/examples/ToyPathTracer/Source/enkiTS/Threads.h
@ -0,0 +1,210 @@
 // Copyright (c) 2013 Doug Binks
 // 
 // This software is provided 'as-is', without any express or implied
 // warranty. In no event will the authors be held liable for any damages
 // arising from the use of this software.
 // 
 // Permission is granted to anyone to use this software for any purpose,
 // including commercial applications, and to alter it and redistribute it
 // freely, subject to the following restrictions:
 // 
 // 1. The origin of this software must not be misrepresented; you must not
 //    claim that you wrote the original software. If you use this software
 //    in a product, an acknowledgement in the product documentation would be
 //    appreciated but is not required.
 // 2. Altered source versions must be plainly marked as such, and must not be
 //    misrepresented as being the original software.
 // 3. This notice may not be removed or altered from any source distribution.
 #pragma once
 #include <stdint.h>
 #include <assert.h>
 #ifdef _WIN32
 	#include "Atomics.h"
 	#define WIN32_LEAN_AND_MEAN
 	#include <Windows.h>
 	#define THREADFUNC_DECL DWORD WINAPI
 	#define THREAD_LOCAL __declspec( thread )
 namespace enki
 {
    typedef HANDLE threadid_t;
    // declare the thread start function as:
    // THREADFUNC_DECL MyThreadStart( void* pArg );
    inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg )
    {
        // posix equiv pthread_create
        DWORD threadid;
        *returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid );
        return  *returnid != NULL;
    }
    inline bool ThreadTerminate( threadid_t threadid )
    {
        // posix equiv pthread_cancel
        return CloseHandle( threadid ) == 0;
    }
    inline uint32_t GetNumHardwareThreads()
    {
        SYSTEM_INFO sysInfo;
        GetSystemInfo(&sysInfo);
        return sysInfo.dwNumberOfProcessors;
    }
 }
 #else // posix
 	#include <pthread.h>
 	#include <unistd.h>
 	#define THREADFUNC_DECL void*
 	#define THREAD_LOCAL __thread
 namespace enki
 {
    typedef pthread_t threadid_t;  
    // declare the thread start function as:
    // THREADFUNC_DECL MyThreadStart( void* pArg );
    inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg )
    {
        // posix equiv pthread_create
        int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg );
        return  retval == 0;
    }
    inline bool ThreadTerminate( threadid_t threadid )
    {
        // posix equiv pthread_cancel
        return pthread_cancel( threadid ) == 0;
    }
    inline uint32_t GetNumHardwareThreads()
    {
        return (uint32_t)sysconf( _SC_NPROCESSORS_ONLN );
    }
 }
 #endif // posix
 // Semaphore implementation
 #ifdef _WIN32
 namespace enki
 {
    struct semaphoreid_t
    {
        HANDLE      sem;
    };
 	inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
    {
        semaphoreid.sem = CreateSemaphore(NULL, 0, MAXLONG, NULL );
    }
    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
    {
        CloseHandle( semaphoreid.sem );
    }
    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
    {
        DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE );
        assert( retval != WAIT_FAILED );
    }
    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
    {
 		if( countWaiting )
 		{
 			ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL );
 		}
    }
 }
 #elif defined(__MACH__)
 // OS X does not have POSIX semaphores
 // see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html
 #include <mach/mach.h>
 namespace enki
 {
    struct semaphoreid_t
    {
        semaphore_t   sem;
    };
 	inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
    {
 		semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 );
    }
    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
    {
        semaphore_destroy( mach_task_self(), semaphoreid.sem );
    }
    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
    {
        semaphore_wait( semaphoreid.sem );
    }
    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
    {
        while( countWaiting-- > 0 )
 		{
 			semaphore_signal( semaphoreid.sem );
 		}
    }
 }
 #else // POSIX
 #include <semaphore.h>
 namespace enki
 {
    struct semaphoreid_t
    {
        sem_t   sem;
    };
 	inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
    {
 		int err = sem_init( &semaphoreid.sem, 0, 0 );
 		assert( err == 0 );
    }
    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
    {
        sem_destroy( &semaphoreid.sem );
    }
    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
    {
        int err = sem_wait( &semaphoreid.sem );
 		assert( err == 0 );
    }
    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
    {
        while( countWaiting-- > 0 )
 		{
 			sem_post( &semaphoreid.sem );
 		}
    }
 }
 #endif
--- a/examples/ToyPathTracer/Windows/ComputeShader.hlsl
+++ b/examples/ToyPathTracer/Windows/ComputeShader.hlsl
@ -0,0 +1,395 @@
 #include "../Source/Config.h"
 inline uint RNG(inout uint state)
 {
    uint x = state;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 15;
    state = x;
    return x;
 }
 float RandomFloat01(inout uint state)
 {
    return (RNG(state) & 0xFFFFFF) / 16777216.0f;
 }
 float3 RandomInUnitDisk(inout uint state)
 {
    float a = RandomFloat01(state) * 2.0f * 3.1415926f;
    float2 xy = float2(cos(a), sin(a));
    xy *= sqrt(RandomFloat01(state));
    return float3(xy, 0);
 }
 float3 RandomInUnitSphere(inout uint state)
 {
    float z = RandomFloat01(state) * 2.0f - 1.0f;
    float t = RandomFloat01(state) * 2.0f * 3.1415926f;
    float r = sqrt(max(0.0, 1.0f - z * z));
    float x = r * cos(t);
    float y = r * sin(t);
    float3 res = float3(x, y, z);
    res *= pow(RandomFloat01(state), 1.0 / 3.0);
    return res;
 }
 float3 RandomUnitVector(inout uint state)
 {
    float z = RandomFloat01(state) * 2.0f - 1.0f;
    float a = RandomFloat01(state) * 2.0f * 3.1415926f;
    float r = sqrt(1.0f - z * z);
    float x = r * cos(a);
    float y = r * sin(a);
    return float3(x, y, z);
 }
 struct Ray
 {
    float3 orig;
    float3 dir;
 };
 Ray MakeRay(float3 orig_, float3 dir_) { Ray r; r.orig = orig_; r.dir = dir_; return r; }
 float3 RayPointAt(Ray r, float t) { return r.orig + r.dir * t; }
 inline bool refract(float3 v, float3 n, float nint, out float3 outRefracted)
 {
    float dt = dot(v, n);
    float discr = 1.0f - nint * nint*(1 - dt * dt);
    if (discr > 0)
    {
        outRefracted = nint * (v - n * dt) - n * sqrt(discr);
        return true;
    }
    return false;
 }
 inline float schlick(float cosine, float ri)
 {
    float r0 = (1 - ri) / (1 + ri);
    r0 = r0 * r0;
    // note: saturate to guard against possible tiny negative numbers
    return r0 + (1 - r0)*pow(saturate(1 - cosine), 5);
 }
 struct Hit
 {
    float3 pos;
    float3 normal;
    float t;
 };
 struct Sphere
 {
    float3 center;
    float radius;
    float invRadius;
 };
 #define MatLambert 0
 #define MatMetal 1
 #define MatDielectric 2
 struct Material
 {
    int type;
    float3 albedo;
    float3 emissive;
    float roughness;
    float ri;
 };
 groupshared Sphere s_GroupSpheres[kCSMaxObjects];
 groupshared Material s_GroupMaterials[kCSMaxObjects];
 groupshared int s_GroupEmissives[kCSMaxObjects];
 struct Camera
 {
    float3 origin;
    float3 lowerLeftCorner;
    float3 horizontal;
    float3 vertical;
    float3 u, v, w;
    float lensRadius;
 };
 Ray CameraGetRay(Camera cam, float s, float t, inout uint state)
 {
    float3 rd = cam.lensRadius * RandomInUnitDisk(state);
    float3 offset = cam.u * rd.x + cam.v * rd.y;
    return MakeRay(cam.origin + offset, normalize(cam.lowerLeftCorner + s * cam.horizontal + t * cam.vertical - cam.origin - offset));
 }
 int HitSpheres(Ray r, int sphereCount, float tMin, float tMax, inout Hit outHit)
 {
    float hitT = tMax;
    int id = -1;
    for (int i = 0; i < sphereCount; ++i)
    {
        Sphere s = s_GroupSpheres[i];
        float3 co = s.center - r.orig;
        float nb = dot(co, r.dir);
        float c = dot(co, co) - s.radius*s.radius;
        float discr = nb * nb - c;
        if (discr > 0)
        {
            float discrSq = sqrt(discr);
            // Try earlier t
            float t = nb - discrSq;
            if (t <= tMin) // before min, try later t!
                t = nb + discrSq;
            if (t > tMin && t < hitT)
            {
                id = i;
                hitT = t;
            }
        }
    }
    if (id != -1)
    {
        outHit.pos = RayPointAt(r, hitT);
        outHit.normal = (outHit.pos - s_GroupSpheres[id].center) * s_GroupSpheres[id].invRadius;
        outHit.t = hitT;
    }
    return id;
 }
 struct Params
 {
    Camera cam;
    int sphereCount;
    int screenWidth;
    int screenHeight;
    int frames;
    float invWidth;
    float invHeight;
    float lerpFac;
    int emissiveCount;
 };
 #define kMinT 0.001f
 #define kMaxT 1.0e7f
 #define kMaxDepth 10
 static int HitWorld(int sphereCount, Ray r, float tMin, float tMax, inout Hit outHit)
 {
    return HitSpheres(r, sphereCount, tMin, tMax, outHit);
 }
 static bool Scatter(int sphereCount, int emissiveCount, int matID, Ray r_in, Hit rec, out float3 attenuation, out Ray scattered, out float3 outLightE, inout int inoutRayCount, inout uint state)
 {
    outLightE = float3(0, 0, 0);
    Material mat = s_GroupMaterials[matID];
    if (mat.type == MatLambert)
    {
        // random point on unit sphere that is tangent to the hit point
        float3 target = rec.pos + rec.normal + RandomUnitVector(state);
        scattered = MakeRay(rec.pos, normalize(target - rec.pos));
        attenuation = mat.albedo;
        // sample lights
 #if DO_LIGHT_SAMPLING
        for (int j = 0; j < emissiveCount; ++j)
        {
            int i = s_GroupEmissives[j];
            if (matID == i)
                continue; // skip self
            Material smat = s_GroupMaterials[i];
            Sphere s = s_GroupSpheres[i];
            // create a random direction towards sphere
            // coord system for sampling: sw, su, sv
            float3 sw = normalize(s.center - rec.pos);
            float3 su = normalize(cross(abs(sw.x)>0.01f ? float3(0, 1, 0) : float3(1, 0, 0), sw));
            float3 sv = cross(sw, su);
            // sample sphere by solid angle
            float cosAMax = sqrt(1.0f - s.radius*s.radius / dot(rec.pos - s.center, rec.pos - s.center));
            float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
            float cosA = 1.0f - eps1 + eps1 * cosAMax;
            float sinA = sqrt(1.0f - cosA * cosA);
            float phi = 2 * 3.1415926 * eps2;
            float3 l = su * cos(phi) * sinA + sv * sin(phi) * sinA + sw * cosA;
            // shoot shadow ray
            Hit lightHit;
            ++inoutRayCount;
            int hitID = HitWorld(sphereCount, MakeRay(rec.pos, l), kMinT, kMaxT, lightHit);
            if (hitID == i)
            {
                float omega = 2 * 3.1415926 * (1 - cosAMax);
                float3 rdir = r_in.dir;
                float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
                outLightE += (mat.albedo * smat.emissive) * (max(0.0f, dot(l, nl)) * omega / 3.1415926);
            }
        }
 #endif
        return true;
    }
    else if (mat.type == MatMetal)
    {
        float3 refl = reflect(r_in.dir, rec.normal);
        // reflected ray, and random inside of sphere based on roughness
        float roughness = mat.roughness;
 #if DO_MITSUBA_COMPARE
        roughness = 0; // until we get better BRDF for metals
 #endif
        scattered = MakeRay(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
        attenuation = mat.albedo;
        return dot(scattered.dir, rec.normal) > 0;
    }
    else if (mat.type == MatDielectric)
    {
        float3 outwardN;
        float3 rdir = r_in.dir;
        float3 refl = reflect(rdir, rec.normal);
        float nint;
        attenuation = float3(1, 1, 1);
        float3 refr;
        float reflProb;
        float cosine;
        if (dot(rdir, rec.normal) > 0)
        {
            outwardN = -rec.normal;
            nint = mat.ri;
            cosine = mat.ri * dot(rdir, rec.normal);
        }
        else
        {
            outwardN = rec.normal;
            nint = 1.0f / mat.ri;
            cosine = -dot(rdir, rec.normal);
        }
        if (refract(rdir, outwardN, nint, refr))
        {
            reflProb = schlick(cosine, mat.ri);
        }
        else
        {
            reflProb = 1;
        }
        if (RandomFloat01(state) < reflProb)
            scattered = MakeRay(rec.pos, normalize(refl));
        else
            scattered = MakeRay(rec.pos, normalize(refr));
    }
    else
    {
        attenuation = float3(1, 0, 1);
        scattered = MakeRay(float3(0,0,0), float3(0, 0, 1));
        return false;
    }
    return true;
 }
 static float3 Trace(int sphereCount, int emissiveCount, Ray r, inout int inoutRayCount, inout uint state)
 {
    float3 col = 0;
    float3 curAtten = 1;
    bool doMaterialE = true;
    // GPUs don't support recursion, so do tracing iterations in a loop up to max depth
    for (int depth = 0; depth < kMaxDepth; ++depth)
    {
        Hit rec;
        ++inoutRayCount;
        int id = HitWorld(sphereCount, r, kMinT, kMaxT, rec);
        if (id >= 0)
        {
            Ray scattered;
            float3 attenuation;
            float3 lightE;
            Material mat = s_GroupMaterials[id];
            float3 matE = mat.emissive;
            if (Scatter(sphereCount, emissiveCount, id, r, rec, attenuation, scattered, lightE, inoutRayCount, state))
            {
 #if DO_LIGHT_SAMPLING
                if (!doMaterialE) matE = 0;
                doMaterialE = (mat.type != MatLambert);
 #endif
                col += curAtten * (matE + lightE);
                curAtten *= attenuation;
                r = scattered;
            }
            else
            {
                col += curAtten * matE;
                break;
            }
        }
        else
        {
            // sky
 #if DO_MITSUBA_COMPARE
            col += curAtten * float3(0.15f, 0.21f, 0.3f); // easier compare with Mitsuba's constant environment light
 #else
            float3 unitDir = r.dir;
            float t = 0.5f*(unitDir.y + 1.0f);
            float3 skyCol = ((1.0f - t)*float3(1.0f, 1.0f, 1.0f) + t * float3(0.5f, 0.7f, 1.0f)) * 0.3f;
            col += curAtten * skyCol;
 #endif
            break;
        }
    }
    return col;
 }
 Texture2D srcImage : register(t0);
 RWTexture2D<float4> dstImage : register(u0);
 StructuredBuffer<Sphere> g_Spheres : register(t1);
 StructuredBuffer<Material> g_Materials : register(t2);
 StructuredBuffer<Params> g_Params : register(t3);
 StructuredBuffer<int> g_Emissives : register(t4);
 RWByteAddressBuffer g_OutRayCount : register(u1);
 [numthreads(kCSGroupSizeX, kCSGroupSizeY, 1)]
 void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
 {
    // First, move scene data (spheres, materials, emissive indices) into group shared
    // memory. Do this in parallel; each thread in group copies its own chunk of data.
    uint threadID = tid.y * kCSGroupSizeX + tid.x;
    uint groupSize = kCSGroupSizeX * kCSGroupSizeY;
    uint objCount = g_Params[0].sphereCount;
    uint myObjCount = (objCount + groupSize - 1) / groupSize;
    uint myObjStart = threadID * myObjCount;
    for (uint io = myObjStart; io < myObjStart + myObjCount; ++io)
    {
        if (io < objCount)
        {
            s_GroupSpheres[io] = g_Spheres[io];
            s_GroupMaterials[io] = g_Materials[io];
        }
        if (io < g_Params[0].emissiveCount)
        {
            s_GroupEmissives[io] = g_Emissives[io];
        }
    }
    GroupMemoryBarrierWithGroupSync();
    int rayCount = 0;
    float3 col = 0;
    Params params = g_Params[0];
    uint rngState = (gid.x * 1973 + gid.y * 9277 + params.frames * 26699) | 1;
    for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++)
    {
        float u = float(gid.x + RandomFloat01(rngState)) * params.invWidth;
        float v = float(gid.y + RandomFloat01(rngState)) * params.invHeight;
        Ray r = CameraGetRay(params.cam, u, v, rngState);
        col += Trace(params.sphereCount, params.emissiveCount, r, rayCount, rngState);
    }
    col *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
    float3 prev = srcImage.Load(int3(gid.xy,0)).rgb;
    col = lerp(col, prev, params.lerpFac);
    dstImage[gid.xy] = float4(col, 1);
    g_OutRayCount.InterlockedAdd(0, rayCount);
 }
--- a/examples/ToyPathTracer/Windows/PixelShader.hlsl
+++ b/examples/ToyPathTracer/Windows/PixelShader.hlsl
@ -0,0 +1,15 @@
 float3 LinearToSRGB(float3 rgb)
 {
    rgb = max(rgb, float3(0, 0, 0));
    return max(1.055 * pow(rgb, 0.416666667) - 0.055, 0.0);
 }
 Texture2D tex : register(t0);
 SamplerState smp : register(s0);
 float4 main(float2 uv : TEXCOORD0) : SV_Target
 {
    float3 col = tex.Sample(smp, uv).rgb;
    col = LinearToSRGB(col);
    return float4(col, 1.0f);
 }
--- a/examples/ToyPathTracer/Windows/TestCpu.sln
+++ b/examples/ToyPathTracer/Windows/TestCpu.sln
@ -0,0 +1,31 @@
 Microsoft Visual Studio Solution File, Format Version 12.00
 # Visual Studio 15
 VisualStudioVersion = 15.0.27130.2036
 MinimumVisualStudioVersion = 10.0.40219.1
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TestCpu", "TestCpu.vcxproj", "{4F84B756-87F5-4B92-827B-DA087DAE1900}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
 		Debug|x86 = Debug|x86
 		Release|x64 = Release|x64
 		Release|x86 = Release|x86
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.ActiveCfg = Debug|x64
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.Build.0 = Debug|x64
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.ActiveCfg = Debug|Win32
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.Build.0 = Debug|Win32
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.ActiveCfg = Release|x64
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.Build.0 = Release|x64
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.ActiveCfg = Release|Win32
 		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {067FB780-37B8-465E-AD7E-E7B238B9C04F}
 	EndGlobalSection
 EndGlobal
--- a/examples/ToyPathTracer/Windows/TestCpu.vcxproj
+++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj
@ -0,0 +1,242 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <VCProjectVersion>15.0</VCProjectVersion>
    <ProjectGuid>{4F84B756-87F5-4B92-827B-DA087DAE1900}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>TestCpu</RootNamespace>
    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v142</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
      <CallingConvention>VectorCall</CallingConvention>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
      <CallingConvention>VectorCall</CallingConvention>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
      <ExceptionHandling>false</ExceptionHandling>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <BufferSecurityCheck>false</BufferSecurityCheck>
      <CallingConvention>VectorCall</CallingConvention>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <ConformanceMode>true</ConformanceMode>
      <ExceptionHandling>false</ExceptionHandling>
      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
      <BufferSecurityCheck>false</BufferSecurityCheck>
      <CallingConvention>VectorCall</CallingConvention>
      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Windows</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="..\Source\enkiTS\TaskScheduler.cpp" />
    <ClCompile Include="..\Source\enkiTS\TaskScheduler_c.cpp" />
    <ClCompile Include="..\Source\Maths.cpp" />
    <ClCompile Include="..\Source\Test.cpp" />
    <ClCompile Include="TestWin.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Source\Config.h" />
    <ClInclude Include="..\Source\enkiTS\Atomics.h" />
    <ClInclude Include="..\Source\enkiTS\LockLessMultiReadPipe.h" />
    <ClInclude Include="..\Source\enkiTS\TaskScheduler.h" />
    <ClInclude Include="..\Source\enkiTS\TaskScheduler_c.h" />
    <ClInclude Include="..\Source\enkiTS\Threads.h" />
    <ClInclude Include="..\Source\Maths.h" />
    <ClInclude Include="..\Source\MathSimd.h" />
    <ClInclude Include="..\Source\Test.h" />
    <ClInclude Include="..\Source\stb_image.h" />
  </ItemGroup>
  <ItemGroup>
    <None Include="..\.editorconfig" />
  </ItemGroup>
  <ItemGroup>
    <FxCompile Include="ComputeShader.hlsl">
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_CSBytecode</VariableName>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledComputeShader.h</HeaderFileOutput>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_CSBytecode</VariableName>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledComputeShader.h</HeaderFileOutput>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_CSBytecode</VariableName>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledComputeShader.h</HeaderFileOutput>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_CSBytecode</VariableName>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledComputeShader.h</HeaderFileOutput>
    </FxCompile>
    <FxCompile Include="PixelShader.hlsl">
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Pixel</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Pixel</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Pixel</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Pixel</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledPixelShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledPixelShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledPixelShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledPixelShader.h</HeaderFileOutput>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_PSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_PSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_PSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_PSBytecode</VariableName>
    </FxCompile>
    <FxCompile Include="VertexShader.hlsl">
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Vertex</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Vertex</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Vertex</ShaderType>
      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Vertex</ShaderType>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledVertexShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledVertexShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledVertexShader.h</HeaderFileOutput>
      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledVertexShader.h</HeaderFileOutput>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_VSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_VSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_VSBytecode</VariableName>
      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_VSBytecode</VariableName>
    </FxCompile>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
 </Project>
--- a/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters
+++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters
@ -0,0 +1,66 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="TestWin.cpp" />
    <ClCompile Include="..\Source\Test.cpp">
      <Filter>Source</Filter>
    </ClCompile>
    <ClCompile Include="..\Source\enkiTS\TaskScheduler.cpp">
      <Filter>Source\enkiTS</Filter>
    </ClCompile>
    <ClCompile Include="..\Source\enkiTS\TaskScheduler_c.cpp">
      <Filter>Source\enkiTS</Filter>
    </ClCompile>
    <ClCompile Include="..\Source\Maths.cpp">
      <Filter>Source</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <Filter Include="Source">
      <UniqueIdentifier>{5f19f217-c1c7-4eeb-be61-8b986fee9375}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source\enkiTS">
      <UniqueIdentifier>{38c448a8-1dcc-4116-9410-a9f8d068caff}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\Source\Test.h">
      <Filter>Source</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\stb_image.h">
      <Filter>Source</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\enkiTS\Atomics.h">
      <Filter>Source\enkiTS</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\enkiTS\LockLessMultiReadPipe.h">
      <Filter>Source\enkiTS</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\enkiTS\TaskScheduler.h">
      <Filter>Source\enkiTS</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\enkiTS\TaskScheduler_c.h">
      <Filter>Source\enkiTS</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\enkiTS\Threads.h">
      <Filter>Source\enkiTS</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\Maths.h">
      <Filter>Source</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\Config.h">
      <Filter>Source</Filter>
    </ClInclude>
    <ClInclude Include="..\Source\MathSimd.h">
      <Filter>Source</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\.editorconfig" />
  </ItemGroup>
  <ItemGroup>
    <FxCompile Include="VertexShader.hlsl" />
    <FxCompile Include="PixelShader.hlsl" />
    <FxCompile Include="ComputeShader.hlsl" />
  </ItemGroup>
 </Project>
--- a/examples/ToyPathTracer/Windows/TestWin.cpp
+++ b/examples/ToyPathTracer/Windows/TestWin.cpp
@ -0,0 +1,540 @@
 #include <stdint.h>
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #include <windows.h>
 #include <d3d11_1.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
 #include <algorithm>
 #include "../Source/Config.h"
 #include "../Source/Maths.h"
 #include "../Source/Test.h"
 #include "CompiledVertexShader.h"
 #include "CompiledPixelShader.h"
 static HINSTANCE g_HInstance;
 static HWND g_Wnd;
 ATOM                MyRegisterClass(HINSTANCE hInstance);
 BOOL                InitInstance(HINSTANCE, int);
 LRESULT CALLBACK    WndProc(HWND, UINT, WPARAM, LPARAM);
 INT_PTR CALLBACK    About(HWND, UINT, WPARAM, LPARAM);
 static HRESULT InitD3DDevice();
 static void ShutdownD3DDevice();
 static void RenderFrame();
 static float* g_Backbuffer;
 static D3D_FEATURE_LEVEL g_D3D11FeatureLevel = D3D_FEATURE_LEVEL_11_0;
 static ID3D11Device* g_D3D11Device = nullptr;
 static ID3D11DeviceContext* g_D3D11Ctx = nullptr;
 static IDXGISwapChain* g_D3D11SwapChain = nullptr;
 static ID3D11RenderTargetView* g_D3D11RenderTarget = nullptr;
 static ID3D11VertexShader* g_VertexShader;
 static ID3D11PixelShader* g_PixelShader;
 static ID3D11Texture2D *g_BackbufferTexture, *g_BackbufferTexture2;
 static ID3D11ShaderResourceView *g_BackbufferSRV, *g_BackbufferSRV2;
 static ID3D11UnorderedAccessView *g_BackbufferUAV, *g_BackbufferUAV2;
 static ID3D11SamplerState* g_SamplerLinear;
 static ID3D11RasterizerState* g_RasterState;
 static int g_BackbufferIndex;
 #if DO_COMPUTE_GPU
 #include "CompiledComputeShader.h"
 struct ComputeParams
 {
    Camera cam;
    int sphereCount;
    int screenWidth;
    int screenHeight;
    int frames;
    float invWidth;
    float invHeight;
    float lerpFac;
    int emissiveCount;
 };
 static ID3D11ComputeShader* g_ComputeShader;
 static ID3D11Buffer* g_DataSpheres;     static ID3D11ShaderResourceView* g_SRVSpheres;
 static ID3D11Buffer* g_DataMaterials;   static ID3D11ShaderResourceView* g_SRVMaterials;
 static ID3D11Buffer* g_DataParams;      static ID3D11ShaderResourceView* g_SRVParams;
 static ID3D11Buffer* g_DataEmissives;   static ID3D11ShaderResourceView* g_SRVEmissives;
 static ID3D11Buffer* g_DataCounter;     static ID3D11UnorderedAccessView* g_UAVCounter;
 static int g_SphereCount, g_ObjSize, g_MatSize;
 static ID3D11Query *g_QueryBegin, *g_QueryEnd, *g_QueryDisjoint;
 #endif // #if DO_COMPUTE_GPU
 int APIENTRY wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR, _In_ int nCmdShow)
 {
    g_Backbuffer = new float[kBackbufferWidth * kBackbufferHeight * 4];
    memset(g_Backbuffer, 0, kBackbufferWidth * kBackbufferHeight * 4 * sizeof(g_Backbuffer[0]));
    InitializeTest();
    MyRegisterClass(hInstance);
    if (!InitInstance (hInstance, nCmdShow))
    {
        return FALSE;
    }
    if (FAILED(InitD3DDevice()))
    {
        ShutdownD3DDevice();
        return 0;
    }
    g_D3D11Device->CreateVertexShader(g_VSBytecode, ARRAYSIZE(g_VSBytecode), NULL, &g_VertexShader);
    g_D3D11Device->CreatePixelShader(g_PSBytecode, ARRAYSIZE(g_PSBytecode), NULL, &g_PixelShader);
 #if DO_COMPUTE_GPU
    g_D3D11Device->CreateComputeShader(g_CSBytecode, ARRAYSIZE(g_CSBytecode), NULL, &g_ComputeShader);
 #endif
    D3D11_TEXTURE2D_DESC texDesc = {};
    texDesc.Width = kBackbufferWidth;
    texDesc.Height = kBackbufferHeight;
    texDesc.MipLevels = 1;
    texDesc.ArraySize = 1;
    texDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
    texDesc.SampleDesc.Count = 1;
    texDesc.SampleDesc.Quality = 0;
 #if DO_COMPUTE_GPU
    texDesc.Usage = D3D11_USAGE_DEFAULT;
    texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
    texDesc.CPUAccessFlags = 0;
 #else
    texDesc.Usage = D3D11_USAGE_DYNAMIC;
    texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
    texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
 #endif
    texDesc.MiscFlags = 0;
    g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture);
    g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture2);
    D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
    srvDesc.Format = texDesc.Format;
    srvDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
    srvDesc.Texture2D.MipLevels = 1;
    srvDesc.Texture2D.MostDetailedMip = 0;
    g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture, &srvDesc, &g_BackbufferSRV);
    g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture2, &srvDesc, &g_BackbufferSRV2);
    D3D11_SAMPLER_DESC smpDesc = {};
    smpDesc.Filter = D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT;
    smpDesc.AddressU = smpDesc.AddressV = smpDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
    g_D3D11Device->CreateSamplerState(&smpDesc, &g_SamplerLinear);
    D3D11_RASTERIZER_DESC rasterDesc = {};
    rasterDesc.FillMode = D3D11_FILL_SOLID;
    rasterDesc.CullMode = D3D11_CULL_NONE;
    g_D3D11Device->CreateRasterizerState(&rasterDesc, &g_RasterState);
 #if DO_COMPUTE_GPU
    D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
    int camSize;
    GetObjectCount(g_SphereCount, g_ObjSize, g_MatSize, camSize);
    assert(g_ObjSize == 20);
    assert(g_MatSize == 36);
    assert(camSize == 88);
    D3D11_BUFFER_DESC bdesc = {};
    bdesc.ByteWidth = g_SphereCount * g_ObjSize;
    bdesc.Usage = D3D11_USAGE_DEFAULT;
    bdesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
    bdesc.CPUAccessFlags = 0;
    bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    bdesc.StructureByteStride = g_ObjSize;
    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataSpheres);
    srvDesc.Format = DXGI_FORMAT_UNKNOWN;
    srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
    srvDesc.Buffer.FirstElement = 0;
    srvDesc.Buffer.NumElements = g_SphereCount;
    g_D3D11Device->CreateShaderResourceView(g_DataSpheres, &srvDesc, &g_SRVSpheres);
    bdesc.ByteWidth = g_SphereCount * g_MatSize;
    bdesc.StructureByteStride = g_MatSize;
    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataMaterials);
    srvDesc.Buffer.NumElements = g_SphereCount;
    g_D3D11Device->CreateShaderResourceView(g_DataMaterials, &srvDesc, &g_SRVMaterials);
    bdesc.ByteWidth = sizeof(ComputeParams);
    bdesc.StructureByteStride = sizeof(ComputeParams);
    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataParams);
    srvDesc.Buffer.NumElements = 1;
    g_D3D11Device->CreateShaderResourceView(g_DataParams, &srvDesc, &g_SRVParams);
    bdesc.ByteWidth = g_SphereCount * 4;
    bdesc.StructureByteStride = 4;
    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataEmissives);
    srvDesc.Buffer.NumElements = g_SphereCount;
    g_D3D11Device->CreateShaderResourceView(g_DataEmissives, &srvDesc, &g_SRVEmissives);
    bdesc.ByteWidth = 4;
    bdesc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS;
    bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
    bdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataCounter);
    uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
    uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
    uavDesc.Buffer.FirstElement = 0;
    uavDesc.Buffer.NumElements = 1;
    uavDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
    g_D3D11Device->CreateUnorderedAccessView(g_DataCounter, &uavDesc, &g_UAVCounter);
    uavDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
    uavDesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
    uavDesc.Texture2D.MipSlice = 0;
    g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture, &uavDesc, &g_BackbufferUAV);
    g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture2, &uavDesc, &g_BackbufferUAV2);
    D3D11_QUERY_DESC qDesc = {};
    qDesc.Query = D3D11_QUERY_TIMESTAMP;
    g_D3D11Device->CreateQuery(&qDesc, &g_QueryBegin);
    g_D3D11Device->CreateQuery(&qDesc, &g_QueryEnd);
    qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
    g_D3D11Device->CreateQuery(&qDesc, &g_QueryDisjoint);
 #endif // #if DO_COMPUTE_GPU
    // Main message loop
    MSG msg = { 0 };
    while (msg.message != WM_QUIT)
    {
        if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE))
        {
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        else
        {
            RenderFrame();
        }
    }
    ShutdownTest();
    ShutdownD3DDevice();
    return (int) msg.wParam;
 }
 ATOM MyRegisterClass(HINSTANCE hInstance)
 {
    WNDCLASSEXW wcex;
    memset(&wcex, 0, sizeof(wcex));
    wcex.cbSize = sizeof(WNDCLASSEX);
    wcex.style          = CS_HREDRAW | CS_VREDRAW;
    wcex.lpfnWndProc    = WndProc;
    wcex.cbClsExtra     = 0;
    wcex.cbWndExtra     = 0;
    wcex.hInstance      = hInstance;
    wcex.hCursor        = LoadCursor(nullptr, IDC_ARROW);
    wcex.hbrBackground  = (HBRUSH)(COLOR_WINDOW+1);
    wcex.lpszClassName  = L"TestClass";
    return RegisterClassExW(&wcex);
 }
 BOOL InitInstance(HINSTANCE hInstance, int nCmdShow)
 {
    g_HInstance = hInstance;
    RECT rc = { 0, 0, kBackbufferWidth, kBackbufferHeight };
    DWORD style = WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX;
    AdjustWindowRect(&rc, style, FALSE);
    HWND hWnd = CreateWindowW(L"TestClass", L"Test", style, CW_USEDEFAULT, CW_USEDEFAULT, rc.right-rc.left, rc.bottom-rc.top, nullptr, nullptr, hInstance, nullptr);
    if (!hWnd)
        return FALSE;
    g_Wnd = hWnd;
    ShowWindow(hWnd, nCmdShow);
    UpdateWindow(hWnd);
    return TRUE;
 }
 static uint64_t s_Time;
 static int s_Count;
 static char s_Buffer[200];
 static unsigned s_Flags = kFlagProgressive;
 static int s_FrameCount = 0;
 static void RenderFrame()
 {
    LARGE_INTEGER time1;
 #if DO_COMPUTE_GPU
    QueryPerformanceCounter(&time1);
    float t = float(clock()) / CLOCKS_PER_SEC;
    UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags);
    g_BackbufferIndex = 1 - g_BackbufferIndex;
    void* dataSpheres = alloca(g_SphereCount * g_ObjSize);
    void* dataMaterials = alloca(g_SphereCount * g_MatSize);
    void* dataEmissives = alloca(g_SphereCount * 4);
    ComputeParams dataParams;
    GetSceneDesc(dataSpheres, dataMaterials, &dataParams.cam, dataEmissives, &dataParams.emissiveCount);
    dataParams.sphereCount = g_SphereCount;
    dataParams.screenWidth = kBackbufferWidth;
    dataParams.screenHeight = kBackbufferHeight;
    dataParams.frames = s_FrameCount;
    dataParams.invWidth = 1.0f / kBackbufferWidth;
    dataParams.invHeight = 1.0f / kBackbufferHeight;
    float lerpFac = float(s_FrameCount) / float(s_FrameCount + 1);
    if (s_Flags & kFlagAnimate)
        lerpFac *= DO_ANIMATE_SMOOTHING;
    if (!(s_Flags & kFlagProgressive))
        lerpFac = 0;
    dataParams.lerpFac = lerpFac;
    g_D3D11Ctx->UpdateSubresource(g_DataSpheres, 0, NULL, dataSpheres, 0, 0);
    g_D3D11Ctx->UpdateSubresource(g_DataMaterials, 0, NULL, dataMaterials, 0, 0);
    g_D3D11Ctx->UpdateSubresource(g_DataParams, 0, NULL, &dataParams, 0, 0);
    g_D3D11Ctx->UpdateSubresource(g_DataEmissives, 0, NULL, dataEmissives, 0, 0);
    ID3D11ShaderResourceView* srvs[] = {
        g_BackbufferIndex == 0 ? g_BackbufferSRV2 : g_BackbufferSRV,
        g_SRVSpheres,
        g_SRVMaterials,
        g_SRVParams,
        g_SRVEmissives
    };
    g_D3D11Ctx->CSSetShaderResources(0, ARRAYSIZE(srvs), srvs);
    ID3D11UnorderedAccessView* uavs[] = {
        g_BackbufferIndex == 0 ? g_BackbufferUAV : g_BackbufferUAV2,
        g_UAVCounter
    };
    g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL);
    g_D3D11Ctx->CSSetShader(g_ComputeShader, NULL, 0);
    g_D3D11Ctx->Begin(g_QueryDisjoint);
    g_D3D11Ctx->End(g_QueryBegin);
    g_D3D11Ctx->Dispatch(kBackbufferWidth/kCSGroupSizeX, kBackbufferHeight/kCSGroupSizeY, 1);
    g_D3D11Ctx->End(g_QueryEnd);
    uavs[0] = NULL;
    g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL);
    ++s_FrameCount;
 #else
    QueryPerformanceCounter(&time1);
    float t = float(clock()) / CLOCKS_PER_SEC;
    static size_t s_RayCounter = 0;
    int rayCount;
    UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags);
    DrawTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, g_Backbuffer, rayCount, s_Flags);
    s_FrameCount++;
    s_RayCounter += rayCount;
    LARGE_INTEGER time2;
    QueryPerformanceCounter(&time2);
    uint64_t dt = time2.QuadPart - time1.QuadPart;
    ++s_Count;
    s_Time += dt;
    if (s_Count > 10)
    {
        LARGE_INTEGER frequency;
        QueryPerformanceFrequency(&frequency);
        double s = double(s_Time) / double(frequency.QuadPart) / s_Count;
        sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount);
        SetWindowTextA(g_Wnd, s_Buffer);
        OutputDebugStringA(s_Buffer);
        s_Count = 0;
        s_Time = 0;
        s_RayCounter = 0;
    }
    D3D11_MAPPED_SUBRESOURCE mapped;
    g_D3D11Ctx->Map(g_BackbufferTexture, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped);
    const uint8_t* src = (const uint8_t*)g_Backbuffer;
    uint8_t* dst = (uint8_t*)mapped.pData;
    for (int y = 0; y < kBackbufferHeight; ++y)
    {
        memcpy(dst, src, kBackbufferWidth * 16);
        src += kBackbufferWidth * 16;
        dst += mapped.RowPitch;
    }
    g_D3D11Ctx->Unmap(g_BackbufferTexture, 0);
 #endif
    g_D3D11Ctx->VSSetShader(g_VertexShader, NULL, 0);
    g_D3D11Ctx->PSSetShader(g_PixelShader, NULL, 0);
    g_D3D11Ctx->PSSetShaderResources(0, 1, g_BackbufferIndex == 0 ? &g_BackbufferSRV : &g_BackbufferSRV2);
    g_D3D11Ctx->PSSetSamplers(0, 1, &g_SamplerLinear);
    g_D3D11Ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
    g_D3D11Ctx->RSSetState(g_RasterState);
    g_D3D11Ctx->Draw(3, 0);
    g_D3D11SwapChain->Present(0, 0);
 #if DO_COMPUTE_GPU
    g_D3D11Ctx->End(g_QueryDisjoint);
    // get GPU times
    while (g_D3D11Ctx->GetData(g_QueryDisjoint, NULL, 0, 0) == S_FALSE) { Sleep(0); }
    D3D10_QUERY_DATA_TIMESTAMP_DISJOINT tsDisjoint;
    g_D3D11Ctx->GetData(g_QueryDisjoint, &tsDisjoint, sizeof(tsDisjoint), 0);
    if (!tsDisjoint.Disjoint)
    {
        UINT64 tsBegin, tsEnd;
        // Note: on some GPUs/drivers, even when the disjoint query above already said "yeah I have data",
        // might still not return "I have data" for timestamp queries before it.
        while (g_D3D11Ctx->GetData(g_QueryBegin, &tsBegin, sizeof(tsBegin), 0) == S_FALSE) { Sleep(0); }
        while (g_D3D11Ctx->GetData(g_QueryEnd, &tsEnd, sizeof(tsEnd), 0) == S_FALSE) { Sleep(0); }
        float s = float(tsEnd - tsBegin) / float(tsDisjoint.Frequency);
        static uint64_t s_RayCounter;
        D3D11_MAPPED_SUBRESOURCE mapped;
        g_D3D11Ctx->Map(g_DataCounter, 0, D3D11_MAP_READ, 0, &mapped);
        s_RayCounter += *(const int*)mapped.pData;
        g_D3D11Ctx->Unmap(g_DataCounter, 0);
        int zeroCount = 0;
        g_D3D11Ctx->UpdateSubresource(g_DataCounter, 0, NULL, &zeroCount, 0, 0);
        static float s_Time;
        ++s_Count;
        s_Time += s;
        if (s_Count > 150)
        {
            s = s_Time / s_Count;
            sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount);
            SetWindowTextA(g_Wnd, s_Buffer);
            s_Count = 0;
            s_Time = 0;
            s_RayCounter = 0;
        }
    }
 #endif // #if DO_COMPUTE_GPU
 }
 LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
 {
    switch (message)
    {
    case WM_PAINT:
        {
            PAINTSTRUCT ps;
            HDC hdc = BeginPaint(hWnd, &ps);
            EndPaint(hWnd, &ps);
        }
        break;
    case WM_DESTROY:
        PostQuitMessage(0);
        break;
    case WM_CHAR:
        if (wParam == 'a')
            s_Flags = s_Flags ^ kFlagAnimate;
        if (wParam == 'p')
        {
            s_Flags = s_Flags ^ kFlagProgressive;
            s_FrameCount = 0;
        }
        break;
    default:
        return DefWindowProc(hWnd, message, wParam, lParam);
    }
    return 0;
 }
 static HRESULT InitD3DDevice()
 {
    HRESULT hr = S_OK;
    RECT rc;
    GetClientRect(g_Wnd, &rc);
    UINT width = rc.right - rc.left;
    UINT height = rc.bottom - rc.top;
    UINT createDeviceFlags = 0;
 #ifdef _DEBUG
    createDeviceFlags |= D3D11_CREATE_DEVICE_DEBUG;
 #endif
    D3D_FEATURE_LEVEL featureLevels[] =
    {
        D3D_FEATURE_LEVEL_11_0,
    };
    UINT numFeatureLevels = ARRAYSIZE(featureLevels);
    hr = D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, createDeviceFlags, featureLevels, numFeatureLevels, D3D11_SDK_VERSION, &g_D3D11Device, &g_D3D11FeatureLevel, &g_D3D11Ctx);
    if (FAILED(hr))
        return hr;
    // Get DXGI factory
    IDXGIFactory1* dxgiFactory = nullptr;
    {
        IDXGIDevice* dxgiDevice = nullptr;
        hr = g_D3D11Device->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast<void**>(&dxgiDevice));
        if (SUCCEEDED(hr))
        {
            IDXGIAdapter* adapter = nullptr;
            hr = dxgiDevice->GetAdapter(&adapter);
            if (SUCCEEDED(hr))
            {
                hr = adapter->GetParent(__uuidof(IDXGIFactory1), reinterpret_cast<void**>(&dxgiFactory));
                adapter->Release();
            }
            dxgiDevice->Release();
        }
    }
    if (FAILED(hr))
        return hr;
    // Create swap chain
    DXGI_SWAP_CHAIN_DESC sd;
    ZeroMemory(&sd, sizeof(sd));
    sd.BufferCount = 1;
    sd.BufferDesc.Width = width;
    sd.BufferDesc.Height = height;
    sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
    sd.BufferDesc.RefreshRate.Numerator = 60;
    sd.BufferDesc.RefreshRate.Denominator = 1;
    sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
    sd.OutputWindow = g_Wnd;
    sd.SampleDesc.Count = 1;
    sd.SampleDesc.Quality = 0;
    sd.Windowed = TRUE;
    hr = dxgiFactory->CreateSwapChain(g_D3D11Device, &sd, &g_D3D11SwapChain);
    // Prevent Alt-Enter
    dxgiFactory->MakeWindowAssociation(g_Wnd, DXGI_MWA_NO_ALT_ENTER);
    dxgiFactory->Release();
    if (FAILED(hr))
        return hr;
    // RTV
    ID3D11Texture2D* pBackBuffer = nullptr;
    hr = g_D3D11SwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&pBackBuffer));
    if (FAILED(hr))
        return hr;
    hr = g_D3D11Device->CreateRenderTargetView(pBackBuffer, nullptr, &g_D3D11RenderTarget);
    pBackBuffer->Release();
    if (FAILED(hr))
        return hr;
    g_D3D11Ctx->OMSetRenderTargets(1, &g_D3D11RenderTarget, nullptr);
    // Viewport
    D3D11_VIEWPORT vp;
    vp.Width = (float)width;
    vp.Height = (float)height;
    vp.MinDepth = 0.0f;
    vp.MaxDepth = 1.0f;
    vp.TopLeftX = 0;
    vp.TopLeftY = 0;
    g_D3D11Ctx->RSSetViewports(1, &vp);
    return S_OK;
 }
 static void ShutdownD3DDevice()
 {
    if (g_D3D11Ctx) g_D3D11Ctx->ClearState();
    if (g_D3D11RenderTarget) g_D3D11RenderTarget->Release();
    if (g_D3D11SwapChain) g_D3D11SwapChain->Release();
    if (g_D3D11Ctx) g_D3D11Ctx->Release();
    if (g_D3D11Device) g_D3D11Device->Release();
 }
--- a/examples/ToyPathTracer/Windows/VertexShader.hlsl
+++ b/examples/ToyPathTracer/Windows/VertexShader.hlsl
@ -0,0 +1,13 @@
 struct vs2ps
 {
    float2 uv : TEXCOORD0;
    float4 pos : SV_Position;
 };
 vs2ps main(uint vid : SV_VertexID)
 {
    vs2ps o;
    o.uv = float2((vid << 1) & 2, vid & 2);
    o.pos = float4(o.uv * float2(2, 2) + float2(-1, -1), 0, 1);
    return o;
 }
--- a/examples/ToyPathTracer/license.md
+++ b/examples/ToyPathTracer/license.md
@ -0,0 +1,24 @@
 This is free and unencumbered software released into the public domain.
 Anyone is free to copy, modify, publish, use, compile, sell, or
 distribute this software, either in source code form or as a compiled
 binary, for any purpose, commercial or non-commercial, and by any
 means.
 In jurisdictions that recognize copyright laws, the author or authors
 of this software dedicate any and all copyright interest in the
 software to the public domain. We make this dedication for the benefit
 of the public at large and to the detriment of our heirs and
 successors. We intend this dedication to be an overt act of
 relinquishment in perpetuity of all present and future rights to this
 software under copyright law.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 For more information, please refer to <http://unlicense.org>