Add aras-p's ToyPathTracer.

https://github.com/aras-p/ToyPathTracer b076563906169aa2f9e6d7218ef85decf81f8f72
2024-11-25 15:34:36 +00:00 · 2019-10-29 22:21:34 +01:00 · 2019-10-29 22:21:34 +01:00 · 0b1eff8b0d
commit 0b1eff8b0d
parent 789b95f259
22 changed files with 3957 additions and 0 deletions
--- a/examples/ToyPathTracer/README
+++ b/examples/ToyPathTracer/README
@ -0,0 +1 @@
+https://github.com/aras-p/ToyPathTracer
--- a/examples/ToyPathTracer/Source/Config.h
+++ b/examples/ToyPathTracer/Source/Config.h
@ -0,0 +1,33 @@
+
+// On Apple platforms (but not when compiling Metal shader source) pull in
+// TargetConditionals.h, which supplies the TARGET_OS_* platform macros.
+#if defined(__APPLE__) && !defined(__METAL_VERSION__)
+#include <TargetConditionals.h>
+#endif
+
+// Backbuffer dimensions (presumably in pixels; the consumers are outside this header).
+#define kBackbufferWidth 1280
+#define kBackbufferHeight 720
+
+// CPU capability switches: Emscripten builds turn off both SIMD and threading,
+// every other target enables them.
+#if defined(__EMSCRIPTEN__)
+#define CPU_CAN_DO_SIMD 0
+#define CPU_CAN_DO_THREADS 0
+#else
+#define CPU_CAN_DO_SIMD 1
+#define CPU_CAN_DO_THREADS 1
+#endif
+
+
+// Renderer tuning knobs and feature toggles; they are consumed outside this header.
+#define DO_SAMPLES_PER_PIXEL 4
+#define DO_ANIMATE_SMOOTHING 0.9f
+#define DO_LIGHT_SAMPLING 1
+#define DO_MITSUBA_COMPARE 0
+
+// Should path tracing be done on the GPU with a compute shader?
+#define DO_COMPUTE_GPU 0
+#define kCSGroupSizeX 8
+#define kCSGroupSizeY 8
+#define kCSMaxObjects 64
+
+// Should float3 struct use SSE/NEON?
+// Only when the GPU compute path is off and the CPU can do SIMD; the trailing
+// "&& 1" is presumably a manual switch (set it to 0 to force the scalar float3).
+#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && CPU_CAN_DO_SIMD && 1)
+
+// Should HitSpheres function use SSE/NEON?
+// As above, the trailing "&& 1" is presumably a manual switch.
+#define DO_HIT_SPHERES_SIMD (CPU_CAN_DO_SIMD && 1)
--- a/examples/ToyPathTracer/Source/MathSimd.h
+++ b/examples/ToyPathTracer/Source/MathSimd.h
@ -0,0 +1,192 @@
+#pragma once
+
+// VM_INLINE: force-inline qualifier applied to the small math helpers in this file.
+// MSVC gets __forceinline; other compilers get the GCC/Clang attribute form
+// (always_inline, plus `unused` and `nodebug`).
+#if defined(_MSC_VER)
+#define VM_INLINE __forceinline
+#else
+#define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline
+#endif
+
+// Number of float lanes in the float4 type defined below.
+#define kSimdWidth 4
+
+#if !defined(__arm__) && !defined(__arm64__) && !defined(__EMSCRIPTEN__)
+
+// ---- SSE implementation
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+// SHUFFLE4(v, X,Y,Z,W) builds a float4 whose lanes are v[X], v[Y], v[Z], v[W].
+// _MM_SHUFFLE takes its selectors highest lane first, hence the reversed order.
+#define SHUFFLE4(V, X,Y,Z,W) float4(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(W,Z,Y,X)))
+
+// Four packed floats held in one SSE register.
+struct float4
+{
+    // Default construction leaves the lanes uninitialized.
+    VM_INLINE float4() {}
+    // Loads four consecutive floats from p (unaligned load).
+    VM_INLINE explicit float4(const float *p) : m(_mm_loadu_ps(p)) {}
+    // Lane 0 is x, lane 3 is w.
+    VM_INLINE explicit float4(float x, float y, float z, float w) : m(_mm_set_ps(w, z, y, x)) {}
+    // Broadcasts v to all four lanes.
+    VM_INLINE explicit float4(float v) : m(_mm_set_ps1(v)) {}
+    // Wraps an existing register value.
+    VM_INLINE explicit float4(__m128 v) : m(v) {}
+
+    // Lane accessors: X reads lane 0 directly, the others first broadcast their lane.
+    VM_INLINE float getX() const
+    {
+        return _mm_cvtss_f32(m);
+    }
+    VM_INLINE float getY() const
+    {
+        const __m128 yyyy = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));
+        return _mm_cvtss_f32(yyyy);
+    }
+    VM_INLINE float getZ() const
+    {
+        const __m128 zzzz = _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2));
+        return _mm_cvtss_f32(zzzz);
+    }
+    VM_INLINE float getW() const
+    {
+        const __m128 wwww = _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3));
+        return _mm_cvtss_f32(wwww);
+    }
+
+    __m128 m;
+};
+
+// Comparison results reuse the float4 storage.
+typedef float4 bool4;
+
+// Lane-wise arithmetic.
+VM_INLINE float4 operator+ (float4 a, float4 b) { return float4(_mm_add_ps(a.m, b.m)); }
+VM_INLINE float4 operator- (float4 a, float4 b) { return float4(_mm_sub_ps(a.m, b.m)); }
+VM_INLINE float4 operator* (float4 a, float4 b) { return float4(_mm_mul_ps(a.m, b.m)); }
+// Lane-wise comparisons; each result lane is a bit mask produced by the compare intrinsic.
+VM_INLINE bool4 operator==(float4 a, float4 b) { return bool4(_mm_cmpeq_ps(a.m, b.m)); }
+VM_INLINE bool4 operator!=(float4 a, float4 b) { return bool4(_mm_cmpneq_ps(a.m, b.m)); }
+VM_INLINE bool4 operator< (float4 a, float4 b) { return bool4(_mm_cmplt_ps(a.m, b.m)); }
+VM_INLINE bool4 operator> (float4 a, float4 b) { return bool4(_mm_cmpgt_ps(a.m, b.m)); }
+VM_INLINE bool4 operator<=(float4 a, float4 b) { return bool4(_mm_cmple_ps(a.m, b.m)); }
+VM_INLINE bool4 operator>=(float4 a, float4 b) { return bool4(_mm_cmpge_ps(a.m, b.m)); }
+// Bitwise combination of masks.
+VM_INLINE bool4 operator&(bool4 a, bool4 b) { return bool4(_mm_and_ps(a.m, b.m)); }
+VM_INLINE bool4 operator|(bool4 a, bool4 b) { return bool4(_mm_or_ps(a.m, b.m)); }
+// Negation flips the sign bit of every lane.
+VM_INLINE float4 operator- (float4 a) { return float4(_mm_xor_ps(a.m, _mm_set1_ps(-0.0f))); }
+// Lane-wise minimum / maximum.
+VM_INLINE float4 min(float4 a, float4 b) { return float4(_mm_min_ps(a.m, b.m)); }
+VM_INLINE float4 max(float4 a, float4 b) { return float4(_mm_max_ps(a.m, b.m)); }
+
+// Horizontal minimum: returns the smallest of the four lanes of v.
+VM_INLINE float hmin(float4 v)
+{
+    // lane0 <- min(x, z), lane1 <- min(y, w)
+    const float4 pairMin = min(v, SHUFFLE4(v, 2, 3, 0, 0));
+    // lane0 <- min of the two pair results
+    const float4 allMin = min(pairMin, SHUFFLE4(pairMin, 1, 0, 0, 0));
+    return allMin.getX();
+}
+
+// Packs the sign bit of each lane into an integer: bit0..bit3 correspond to X..W.
+VM_INLINE unsigned mask(float4 v)
+{
+    const int signBits = _mm_movemask_ps(v.m);
+    return signBits;
+}
+// Branching helpers for comparison results:
+// any() is true when at least one lane is set, all() when every lane is set.
+VM_INLINE bool any(bool4 v)
+{
+    const unsigned bits = mask(v);
+    return bits != 0;
+}
+VM_INLINE bool all(bool4 v)
+{
+    const unsigned bits = mask(v);
+    return bits == 15;
+}
+
+// "select", i.e. hibit(cond) ? b : a
+// on SSE4.1 and up this can be done easily via "blend" instruction;
+// on older SSEs has to do a bunch of hoops, see
+// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/
+
+// Per lane: returns b where the sign bit of cond is set, a otherwise.
+VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
+{
+#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
+    return float4(_mm_blendv_ps(a.m, b.m, cond.m));
+#else
+    // Broadcast each lane's sign bit across the lane, then merge with and/andnot/or.
+    const __m128 laneMask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31));
+    const __m128 fromB = _mm_and_ps(laneMask, b.m);
+    const __m128 fromA = _mm_andnot_ps(laneMask, a.m);
+    return float4(_mm_or_ps(fromB, fromA));
+#endif
+}
+// Integer-lane variant of select. Per 32-bit lane: returns b where the sign bit
+// of cond is set, a otherwise.
+VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond)
+{
+#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1
+    // Blend per 32-bit lane (blendv_ps) rather than per byte (blendv_epi8), so that
+    // only the lane's sign bit decides. This matches the float4 overload and the
+    // pre-SSE4.1 fallback below even when a mask lane is not all-ones.
+    return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), cond.m));
+#else
+    // Broadcast each lane's sign bit across the lane, then merge with and/andnot/or.
+    __m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31);
+    return _mm_or_si128(_mm_and_si128(d, b), _mm_andnot_si128(d, a));
+#endif
+}
+
+// Lane-wise square root.
+VM_INLINE float4 sqrtf(float4 v)
+{
+    v.m = _mm_sqrt_ps(v.m);
+    return v;
+}
+
+#elif !defined(__EMSCRIPTEN__)
+
+// ---- NEON implementation
+
+#define USE_NEON 1
+#include <arm_neon.h>
+
+// Four packed floats held in one NEON register.
+struct float4
+{
+    // Default construction leaves the lanes uninitialized.
+    VM_INLINE float4() {}
+    // Loads four consecutive floats from p.
+    VM_INLINE explicit float4(const float *p) : m(vld1q_f32(p)) {}
+    // Lane 0 is x, lane 3 is w.
+    VM_INLINE explicit float4(float x, float y, float z, float w)
+    {
+        const float lanes[4] = {x, y, z, w};
+        m = vld1q_f32(lanes);
+    }
+    // Broadcasts v to all four lanes.
+    VM_INLINE explicit float4(float v) : m(vdupq_n_f32(v)) {}
+    // Wraps an existing register value.
+    VM_INLINE explicit float4(float32x4_t v) : m(v) {}
+
+    // Lane accessors.
+    VM_INLINE float getX() const
+    {
+        return vgetq_lane_f32(m, 0);
+    }
+    VM_INLINE float getY() const
+    {
+        return vgetq_lane_f32(m, 1);
+    }
+    VM_INLINE float getZ() const
+    {
+        return vgetq_lane_f32(m, 2);
+    }
+    VM_INLINE float getW() const
+    {
+        return vgetq_lane_f32(m, 3);
+    }
+
+    float32x4_t m;
+};
+
+// Comparison results reuse the float4 storage.
+typedef float4 bool4;
+
+// Lane-wise arithmetic.
+VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; }
+VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; }
+VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; }
+// Lane-wise comparisons. The NEON compare intrinsics return uint32x4_t masks, so the
+// bits are reinterpreted explicitly to be stored in the float-typed bool4; this keeps
+// the code valid when implicit (lax) vector conversions are disabled.
+VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vceqq_f32(a.m, b.m)); return a; }
+VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.m, b.m))); return a; }
+VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcltq_f32(a.m, b.m)); return a; }
+VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcgtq_f32(a.m, b.m)); return a; }
+VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcleq_f32(a.m, b.m)); return a; }
+VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vreinterpretq_f32_u32(vcgeq_f32(a.m, b.m)); return a; }
+// Bitwise combination of masks (performed on the integer view of the lanes).
+VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.m), vreinterpretq_u32_f32(b.m))); return a; }
+VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.m), vreinterpretq_u32_f32(b.m))); return a; }
+// Lane-wise negation, minimum and maximum.
+VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; }
+VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; }
+VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
+
+// Horizontal minimum: returns the smallest of the four lanes of v.
+VM_INLINE float hmin(float4 v)
+{
+    // Pairwise min of the two halves: (min(x, y), min(z, w))
+    const float32x2_t pairMin = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
+    // Pairwise min again collapses both results into lane 0.
+    const float32x2_t allMin = vpmin_f32(pairMin, pairMin);
+    return vget_lane_f32(allMin, 0);
+}
+
+// Packs the sign bit of each lane into an integer: bit0..bit3 correspond to X..W.
+VM_INLINE unsigned mask(float4 v)
+{
+    static const uint32x4_t laneBit = { 1, 2, 4, 8 };
+    static const uint32x4_t signBit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+    const uint32x4_t rawBits = vreinterpretq_u32_f32(v.m);
+    // All-ones in every lane whose sign bit is set.
+    const uint32x4_t isSet = vtstq_u32(rawBits, signBit);
+    // Keep only each lane's own bit, then OR the four lanes together.
+    const uint32x4_t weighted = vandq_u32(isSet, laneBit);
+    const uint32x2_t halves = vorr_u32(vget_low_u32(weighted), vget_high_u32(weighted));
+    return vget_lane_u32(halves, 0) | vget_lane_u32(halves, 1);
+}
+// Branching helpers for comparison results:
+// any() is true when at least one lane is set, all() when every lane is set.
+VM_INLINE bool any(bool4 v)
+{
+    const unsigned bits = mask(v);
+    return bits != 0;
+}
+VM_INLINE bool all(bool4 v)
+{
+    const unsigned bits = mask(v);
+    return bits == 15;
+}
+
+// "select", i.e. hibit(cond) ? b : a
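+// On NEON this is done with vbsl, which is a bitwise select: every result bit is
+// taken from b where the matching bit of cond is set and from a otherwise, so cond
+// lanes are expected to be all-ones or all-zeros (as the comparison operators produce).
+// For the SSE variant's sign-bit handling see
+// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/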
+
+// Returns b where cond is set and a otherwise. vbsl selects bit by bit, so cond
+// lanes should be all-ones or all-zeros (which is what the comparison operators yield).
+VM_INLINE float4 select(float4 a, float4 b, bool4 cond)
+{
+    // vbslq_f32 takes its mask as uint32x4_t; reinterpret the float-typed mask bits.
+    a.m = vbslq_f32(vreinterpretq_u32_f32(cond.m), b.m, a.m);
+    return a;
+}
+// Integer-lane variant of select, with the same mask requirements.
+VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond)
+{
+    // Use the signed-integer form of the intrinsic so operand and result types match.
+    return vbslq_s32(vreinterpretq_u32_f32(cond.m), b, a);
+}
+
+// Lane-wise square root, computed as v * rsqrt(v): the reciprocal square root
+// starts from the vrsqrteq_f32 estimate (S0) and is refined three times with
+// vrsqrtsq_f32 steps (S1, S2, S3) before the final multiply by v.
+// NOTE(review): a lane holding 0 looks like it yields NaN here (0 * inf) rather
+// than 0 -- confirm that callers ignore such lanes.
+VM_INLINE float4 sqrtf(float4 v)
+{
+    float32x4_t V = v.m;
+    float32x4_t S0 = vrsqrteq_f32(V);
+    float32x4_t P0 = vmulq_f32( V, S0 );
+    float32x4_t R0 = vrsqrtsq_f32( P0, S0 );
+    float32x4_t S1 = vmulq_f32( S0, R0 );
+    float32x4_t P1 = vmulq_f32( V, S1 );
+    float32x4_t R1 = vrsqrtsq_f32( P1, S1 );
+    float32x4_t S2 = vmulq_f32( S1, R1 );
+    float32x4_t P2 = vmulq_f32( V, S2 );
+    float32x4_t R2 = vrsqrtsq_f32( P2, S2 );
+    float32x4_t S3 = vmulq_f32( S2, R2 );
+    return float4(vmulq_f32(V, S3));
+}
+
+// Broadcast a single lane of v (X = lane 0 ... W = lane 3) to all four lanes.
+VM_INLINE float4 splatX(float32x4_t v)
+{
+    const float32x2_t xy = vget_low_f32(v);
+    return float4(vdupq_lane_f32(xy, 0));
+}
+VM_INLINE float4 splatY(float32x4_t v)
+{
+    const float32x2_t xy = vget_low_f32(v);
+    return float4(vdupq_lane_f32(xy, 1));
+}
+VM_INLINE float4 splatZ(float32x4_t v)
+{
+    const float32x2_t zw = vget_high_f32(v);
+    return float4(vdupq_lane_f32(zw, 0));
+}
+VM_INLINE float4 splatW(float32x4_t v)
+{
+    const float32x2_t zw = vget_high_f32(v);
+    return float4(vdupq_lane_f32(zw, 1));
+}
+
+#endif
--- a/examples/ToyPathTracer/Source/Maths.cpp
+++ b/examples/ToyPathTracer/Source/Maths.cpp
@ -0,0 +1,203 @@
+#include "Maths.h"
+#include <stdlib.h>
+#include <stdint.h>
+
+// One step of a 32-bit xorshift generator using the shifts 13 / 17 / 15.
+// Updates `state` in place and returns the new value; a state of 0 stays 0.
+static uint32_t XorShift32(uint32_t& state)
+{
+    uint32_t next = state;
+    next = next ^ (next << 13);
+    next = next ^ (next >> 17);
+    next = next ^ (next << 15);
+    return state = next;
+}
+
+// Returns a pseudo-random float in [0, 1): the low 24 bits of the next
+// XorShift32 value divided by 2^24. Advances `state`.
+float RandomFloat01(uint32_t& state)
+{
+    const uint32_t low24 = XorShift32(state) & 0xFFFFFF;
+    return low24 / 16777216.0f;
+}
+
+// Rejection-samples a point strictly inside the unit disk of the XY plane (z = 0).
+// Advances `state` by two random draws per attempt.
+float3 RandomInUnitDisk(uint32_t& state)
+{
+    float3 p;
+    do
+    {
+        // Draw the coordinates in separate statements: the evaluation order of
+        // function/constructor arguments is unspecified, so two RandomFloat01 calls
+        // inside one argument list could hand the random stream to x and y in a
+        // different order depending on the compiler.
+        float x = RandomFloat01(state);
+        float y = RandomFloat01(state);
+        p = 2.0f * float3(x, y, 0) - float3(1,1,0);
+    } while (dot(p,p) >= 1.0f);
+    return p;
+}
+
+// Rejection-samples a point strictly inside the unit sphere.
+// Advances `state` by three random draws per attempt.
+float3 RandomInUnitSphere(uint32_t& state)
+{
+    float3 p;
+    do {
+        // Draw the coordinates in separate statements: the evaluation order of
+        // function/constructor arguments is unspecified, so several RandomFloat01
+        // calls inside one argument list could hand the random stream to x, y and z
+        // in a different order depending on the compiler.
+        float x = RandomFloat01(state);
+        float y = RandomFloat01(state);
+        float z = RandomFloat01(state);
+        p = 2.0f * float3(x, y, z) - float3(1,1,1);
+    } while (sqLength(p) >= 1.0f);
+    return p;
+}
+
+// Returns a random unit-length direction: picks a height z in [-1, 1) and an
+// angle around the Z axis, then places the point on the circle of radius
+// sqrt(1 - z^2) at that height. Advances `state` by two random draws.
+float3 RandomUnitVector(uint32_t& state)
+{
+    const float height = RandomFloat01(state) * 2.0f - 1.0f;
+    const float angle = RandomFloat01(state) * 2.0f * kPI;
+    const float ringRadius = sqrtf(1.0f - height * height);
+    return float3(ringRadius * cosf(angle), ringRadius * sinf(angle), height);
+}
+
+
+// Intersects ray r with every sphere in `spheres` and reports the closest hit whose
+// distance t satisfies tMin < t < tMax.
+// Returns the index of the hit sphere, or -1 when nothing is hit; on a hit, outHit
+// receives the hit position, the surface normal (scaled by the sphere's invRadius)
+// and t. outHit is left untouched when -1 is returned.
+// The intersection math (discr = nb*nb - c) has no |dir|^2 term, i.e. it assumes
+// r.dir is unit length.
+int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit)
+{
+#if DO_HIT_SPHERES_SIMD
+    // Per-lane best hit so far: distance (starts at tMax) and sphere index (starts at -1).
+    float4 hitT = float4(tMax);
+#if USE_NEON
+    int32x4_t id = vdupq_n_s32(-1);
+#else
+    __m128i id = _mm_set1_epi32(-1);
+#endif
+
+    // Broadcast each ray component to all four lanes.
+#if DO_FLOAT3_WITH_SIMD && !USE_NEON
+    float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0);
+    float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1);
+    float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2);
+    float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0);
+    float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1);
+    float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2);
+#elif DO_FLOAT3_WITH_SIMD
+    float4 rOrigX = splatX(r.orig.m);
+    float4 rOrigY = splatY(r.orig.m);
+    float4 rOrigZ = splatZ(r.orig.m);
+    float4 rDirX = splatX(r.dir.m);
+    float4 rDirY = splatY(r.dir.m);
+    float4 rDirZ = splatZ(r.dir.m);
+#else
+    float4 rOrigX = float4(r.orig.x);
+    float4 rOrigY = float4(r.orig.y);
+    float4 rOrigZ = float4(r.orig.z);
+    float4 rDirX = float4(r.dir.x);
+    float4 rDirY = float4(r.dir.y);
+    float4 rDirZ = float4(r.dir.z);
+#endif
+    float4 tMin4 = float4(tMin);
+    // Sphere indices handled by lanes 0..3 in the current iteration: (0, 1, 2, 3).
+#if USE_NEON
+    int32x4_t curId = vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32)));
+#else
+    __m128i curId = _mm_set_epi32(3, 2, 1, 0);
+#endif
+    // process 4 spheres at once
+    for (int i = 0; i < spheres.simdCount; i += kSimdWidth)
+    {
+        // load data for 4 spheres
+        float4 sCenterX = float4(spheres.centerX + i);
+        float4 sCenterY = float4(spheres.centerY + i);
+        float4 sCenterZ = float4(spheres.centerZ + i);
+        float4 sSqRadius = float4(spheres.sqRadius + i);
+        // note: we flip this vector and calculate -b (nb) since that happens to be slightly preferable computationally
+        float4 coX = sCenterX - rOrigX;
+        float4 coY = sCenterY - rOrigY;
+        float4 coZ = sCenterZ - rOrigZ;
+        float4 nb = coX * rDirX + coY * rDirY + coZ * rDirZ;
+        float4 c = coX * coX + coY * coY + coZ * coZ - sSqRadius;
+        float4 discr = nb * nb - c;
+        bool4 discrPos = discr > float4(0.0f);
+        // if ray hits any of the 4 spheres
+        if (any(discrPos))
+        {
+            float4 discrSq = sqrtf(discr);
+
+            // ray could hit spheres at t0 & t1
+            float4 t0 = nb - discrSq;
+            float4 t1 = nb + discrSq;
+
+            float4 t = select(t1, t0, t0 > tMin4); // if t0 is above min, take it (since it's the earlier hit); else try t1.
+            // A lane only counts when its discriminant is positive and t lies in (tMin, current best).
+            bool4 msk = discrPos & (t > tMin4) & (t < hitT);
+            // if hit, take it
+            id = select(id, curId, msk);
+            hitT = select(hitT, t, msk);
+        }
+        // Advance the per-lane sphere indices to the next group of four.
+#if USE_NEON
+        curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth));
+#else
+        curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth));
+#endif
+    }
+    // now we have up to 4 hits, find and return closest one
+    float minT = hmin(hitT);
+    if (minT < tMax) // any actual hits?
+    {
+        // Bit mask of the lanes whose distance equals the minimum.
+        int minMask = mask(hitT == float4(minT));
+        if (minMask != 0)
+        {
+            int id_scalar[4];
+            float hitT_scalar[4];
+#if USE_NEON
+            vst1q_s32(id_scalar, id);
+            vst1q_f32(hitT_scalar, hitT.m);
+#else
+            _mm_storeu_si128((__m128i *)id_scalar, id);
+            _mm_storeu_ps(hitT_scalar, hitT.m);
+#endif
+
+            // In general, you would do this with a bit scan (first set/trailing zero count).
+            // But who cares, it's only 16 options.
+            // laneId[m] is the index of the lowest set bit of m.
+            static const int laneId[16] =
+            {
+                0, 0, 1, 0, // 00xx
+                2, 0, 1, 0, // 01xx
+                3, 0, 1, 0, // 10xx
+                2, 0, 1, 0, // 11xx
+            };
+
+            int lane = laneId[minMask];
+            int hitId = id_scalar[lane];
+            float finalHitT = hitT_scalar[lane];
+
+            outHit.pos = r.pointAt(finalHitT);
+            outHit.normal = (outHit.pos - float3(spheres.centerX[hitId], spheres.centerY[hitId], spheres.centerZ[hitId])) * spheres.invRadius[hitId];
+            outHit.t = finalHitT;
+            return hitId;
+        }
+    }
+
+    return -1;
+
+#else // #if DO_HIT_SPHERES_SIMD
+
+    // Scalar path: test the spheres one by one, keeping the closest valid hit.
+    float hitT = tMax;
+    int id = -1;
+    for (int i = 0; i < spheres.count; ++i)
+    {
+        float coX = spheres.centerX[i] - r.orig.getX();
+        float coY = spheres.centerY[i] - r.orig.getY();
+        float coZ = spheres.centerZ[i] - r.orig.getZ();
+        float nb = coX * r.dir.getX() + coY * r.dir.getY() + coZ * r.dir.getZ();
+        float c = coX * coX + coY * coY + coZ * coZ - spheres.sqRadius[i];
+        float discr = nb * nb - c;
+        if (discr > 0)
+        {
+            float discrSq = sqrtf(discr);
+
+            // Try earlier t
+            float t = nb - discrSq;
+            if (t <= tMin) // before min, try later t!
+                t = nb + discrSq;
+
+            if (t > tMin && t < hitT)
+            {
+                id = i;
+                hitT = t;
+            }
+        }
+    }
+    if (id != -1)
+    {
+        outHit.pos = r.pointAt(hitT);
+        outHit.normal = (outHit.pos - float3(spheres.centerX[id], spheres.centerY[id], spheres.centerZ[id])) * spheres.invRadius[id];
+        outHit.t = hitT;
+        return id;
+    }
+    else
+        return -1;
+#endif // #else of #if DO_HIT_SPHERES_SIMD
+}
--- a/examples/ToyPathTracer/Source/Maths.h
+++ b/examples/ToyPathTracer/Source/Maths.h
@ -0,0 +1,436 @@
+#pragma once
+
+#include <math.h>
+#include <assert.h>
+#include <stdint.h>
+#include "Config.h"
+#include "MathSimd.h"
+
+#define kPI 3.1415926f
+
+// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
+#if DO_FLOAT3_WITH_SIMD
+
+
+#if !defined(__arm__) && !defined(__arm64__)
+
+// ---- SSE implementation
+
+// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
+// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
+// The fourth lane of the result repeats the selected Z lane.
+#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))
+
+// Three floats kept in lanes 0..2 of an SSE register; lane 3 is padding.
+struct float3
+{
+    // Default construction leaves the lanes uninitialized.
+    VM_INLINE float3() {}
+    // Reads p[0..2]; the padding lane repeats z.
+    VM_INLINE explicit float3(const float *p) : m(_mm_set_ps(p[2], p[2], p[1], p[0])) {}
+    VM_INLINE explicit float3(float x, float y, float z) : m(_mm_set_ps(z, z, y, x)) {}
+    // Broadcasts v to every lane.
+    VM_INLINE explicit float3(float v) : m(_mm_set1_ps(v)) {}
+    // Wraps an existing register value.
+    VM_INLINE explicit float3(__m128 v) : m(v) {}
+
+    // Component accessors: X reads lane 0 directly, Y/Z first broadcast their lane.
+    VM_INLINE float getX() const
+    {
+        return _mm_cvtss_f32(m);
+    }
+    VM_INLINE float getY() const
+    {
+        const __m128 yyyy = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));
+        return _mm_cvtss_f32(yyyy);
+    }
+    VM_INLINE float getZ() const
+    {
+        const __m128 zzzz = _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2));
+        return _mm_cvtss_f32(zzzz);
+    }
+
+    // Component rotations.
+    VM_INLINE float3 yzx() const
+    {
+        return SHUFFLE3(*this, 1, 2, 0);
+    }
+    VM_INLINE float3 zxy() const
+    {
+        return SHUFFLE3(*this, 2, 0, 1);
+    }
+
+    // Writes x, y, z to p[0..2].
+    VM_INLINE void store(float *p) const
+    {
+        p[0] = getX();
+        p[1] = getY();
+        p[2] = getZ();
+    }
+
+    // Setters replace one component and keep the other lanes.
+    void setX(float x)
+    {
+        const __m128 newX = _mm_set_ss(x);
+        m = _mm_move_ss(m, newX);
+    }
+    void setY(float y)
+    {
+        // (y, m1, m2, m3)
+        const __m128 yInLane0 = _mm_move_ss(m, _mm_set_ss(y));
+        // (y, y, m2, m3)
+        const __m128 ySpread = _mm_shuffle_ps(yInLane0, yInLane0, _MM_SHUFFLE(3, 2, 0, 0));
+        // (m0, y, m2, m3)
+        m = _mm_move_ss(ySpread, m);
+    }
+    void setZ(float z)
+    {
+        // (z, m1, m2, m3)
+        const __m128 zInLane0 = _mm_move_ss(m, _mm_set_ss(z));
+        // (z, m1, z, m3)
+        const __m128 zSpread = _mm_shuffle_ps(zInLane0, zInLane0, _MM_SHUFFLE(3, 0, 1, 0));
+        // (m0, m1, z, m3)
+        m = _mm_move_ss(zSpread, m);
+    }
+
+    __m128 m;
+};
+
+// Comparison results reuse the float3 storage.
+typedef float3 bool3;
+
+// Component-wise arithmetic between two vectors.
+VM_INLINE float3 operator+ (float3 a, float3 b) { return float3(_mm_add_ps(a.m, b.m)); }
+VM_INLINE float3 operator- (float3 a, float3 b) { return float3(_mm_sub_ps(a.m, b.m)); }
+VM_INLINE float3 operator* (float3 a, float3 b) { return float3(_mm_mul_ps(a.m, b.m)); }
+VM_INLINE float3 operator/ (float3 a, float3 b) { return float3(_mm_div_ps(a.m, b.m)); }
+// Arithmetic with a scalar, which is broadcast to every lane first.
+VM_INLINE float3 operator* (float3 a, float b) { return float3(_mm_mul_ps(a.m, _mm_set1_ps(b))); }
+VM_INLINE float3 operator/ (float3 a, float b) { return float3(_mm_div_ps(a.m, _mm_set1_ps(b))); }
+VM_INLINE float3 operator* (float a, float3 b) { return float3(_mm_mul_ps(_mm_set1_ps(a), b.m)); }
+VM_INLINE float3 operator/ (float a, float3 b) { return float3(_mm_div_ps(_mm_set1_ps(a), b.m)); }
+// Compound assignment, expressed through the binary operators above.
+VM_INLINE float3& operator+= (float3 &a, float3 b) { return a = a + b; }
+VM_INLINE float3& operator-= (float3 &a, float3 b) { return a = a - b; }
+VM_INLINE float3& operator*= (float3 &a, float3 b) { return a = a * b; }
+VM_INLINE float3& operator/= (float3 &a, float3 b) { return a = a / b; }
+VM_INLINE float3& operator*= (float3 &a, float b) { return a = a * b; }
+VM_INLINE float3& operator/= (float3 &a, float b) { return a = a / b; }
+// Component-wise comparisons; each result lane is a bit mask produced by the compare intrinsic.
+VM_INLINE bool3 operator==(float3 a, float3 b) { return bool3(_mm_cmpeq_ps(a.m, b.m)); }
+VM_INLINE bool3 operator!=(float3 a, float3 b) { return bool3(_mm_cmpneq_ps(a.m, b.m)); }
+VM_INLINE bool3 operator< (float3 a, float3 b) { return bool3(_mm_cmplt_ps(a.m, b.m)); }
+VM_INLINE bool3 operator> (float3 a, float3 b) { return bool3(_mm_cmpgt_ps(a.m, b.m)); }
+VM_INLINE bool3 operator<=(float3 a, float3 b) { return bool3(_mm_cmple_ps(a.m, b.m)); }
+VM_INLINE bool3 operator>=(float3 a, float3 b) { return bool3(_mm_cmpge_ps(a.m, b.m)); }
+// Component-wise minimum / maximum.
+VM_INLINE float3 min(float3 a, float3 b) { return float3(_mm_min_ps(a.m, b.m)); }
+VM_INLINE float3 max(float3 a, float3 b) { return float3(_mm_max_ps(a.m, b.m)); }
+
+// Negation, computed as 0 - a.
+VM_INLINE float3 operator- (float3 a) { return float3(_mm_sub_ps(_mm_setzero_ps(), a.m)); }
+
+// Smallest of the three components.
+VM_INLINE float hmin(float3 v)
+{
+    // lane0 <- min(x, y)
+    const float3 xy = min(v, SHUFFLE3(v, 1, 0, 2));
+    // lane0 <- min(min(x, y), z)
+    const float3 xyz = min(xy, SHUFFLE3(xy, 2, 0, 1));
+    return xyz.getX();
+}
+// Largest of the three components.
+VM_INLINE float hmax(float3 v)
+{
+    // lane0 <- max(x, y)
+    const float3 xy = max(v, SHUFFLE3(v, 1, 0, 2));
+    // lane0 <- max(max(x, y), z)
+    const float3 xyz = max(xy, SHUFFLE3(xy, 2, 0, 1));
+    return xyz.getX();
+}
+
+// Cross product a x b:
+//   x  <-  a.y*b.z - a.z*b.y
+//   y  <-  a.z*b.x - a.x*b.z
+//   z  <-  a.x*b.y - a.y*b.x
+// Computing the difference in rotated (zxy) order and rotating the result once
+// more saves a shuffle.
+VM_INLINE float3 cross(float3 a, float3 b)
+{
+    const float3 lhs = a.zxy() * b;
+    const float3 rhs = a * b.zxy();
+    const float3 rotated = lhs - rhs;
+    return rotated.zxy();
+}
+
+// Packs the sign bits of X, Y, Z into bit0..bit2; the padding lane is masked off.
+VM_INLINE unsigned mask(float3 v)
+{
+    const int signBits = _mm_movemask_ps(v.m);
+    return signBits & 7;
+}
+// Branching helpers for comparison results:
+// any() is true when at least one component is set, all() when all three are.
+VM_INLINE bool any(bool3 v)
+{
+    const unsigned bits = mask(v);
+    return bits != 0;
+}
+VM_INLINE bool all(bool3 v)
+{
+    const unsigned bits = mask(v);
+    return bits == 7;
+}
+
+// Component-wise clamp of t: raised to at least a, then limited to at most b.
+VM_INLINE float3 clamp(float3 t, float3 a, float3 b)
+{
+    const float3 raised = max(t, a);
+    return min(raised, b);
+}
+// Sum of the three components.
+VM_INLINE float sum(float3 v)
+{
+    return v.getX() + v.getY() + v.getZ();
+}
+// Dot product.
+VM_INLINE float dot(float3 a, float3 b)
+{
+    const float3 products = a * b;
+    return sum(products);
+}
+
+#else // #if !defined(__arm__) && !defined(__arm64__)
+
+// ---- NEON implementation
+
+#include <arm_neon.h>
+
+// Three floats kept in lanes 0..2 of a NEON register; lane 3 is padding.
+struct float3
+{
+    // Default construction leaves the lanes uninitialized.
+    VM_INLINE float3() {}
+    // Reads p[0..2]; the padding lane is set to 0.
+    VM_INLINE explicit float3(const float *p)
+    {
+        const float lanes[4] = {p[0], p[1], p[2], 0};
+        m = vld1q_f32(lanes);
+    }
+    // The padding lane is set to 0.
+    VM_INLINE explicit float3(float x, float y, float z)
+    {
+        const float lanes[4] = {x, y, z, 0};
+        m = vld1q_f32(lanes);
+    }
+    // Broadcasts v to every lane (including the padding lane).
+    VM_INLINE explicit float3(float v) : m(vdupq_n_f32(v)) {}
+    // Wraps an existing register value.
+    VM_INLINE explicit float3(float32x4_t v) : m(v) {}
+
+    // Component accessors.
+    VM_INLINE float getX() const
+    {
+        return vgetq_lane_f32(m, 0);
+    }
+    VM_INLINE float getY() const
+    {
+        return vgetq_lane_f32(m, 1);
+    }
+    VM_INLINE float getZ() const
+    {
+        return vgetq_lane_f32(m, 2);
+    }
+
+    // Component rotation (y, z, x); the padding lane ends up holding y.
+    VM_INLINE float3 yzx() const
+    {
+        const float32x2_t xy = vget_low_f32(m);
+        const float32x2_t yz = vext_f32(xy, vget_high_f32(m), 1);
+        return float3(vcombine_f32(yz, xy));
+    }
+    // Component rotation (z, x, y); the padding lane keeps w.
+    VM_INLINE float3 zxy() const
+    {
+        // Rotate the lanes by one: (y, z, w, x)
+        const float32x4_t rotated = vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(m), vreinterpretq_s32_f32(m), 1));
+        // Odd lanes of `rotated` followed by odd lanes of m: (z, x, y, w)
+        return float3(vuzpq_f32(rotated, m).val[1]);
+    }
+
+    // Writes x, y, z to p[0..2].
+    VM_INLINE void store(float *p) const
+    {
+        p[0] = getX();
+        p[1] = getY();
+        p[2] = getZ();
+    }
+
+    // Setters replace one component and keep the other lanes.
+    void setX(float x) { m = vsetq_lane_f32(x, m, 0); }
+    void setY(float y) { m = vsetq_lane_f32(y, m, 1); }
+    void setZ(float z) { m = vsetq_lane_f32(z, m, 2); }
+
+    float32x4_t m;
+};
+
+// Comparison results reuse the float3 storage.
+typedef float3 bool3;
+
+// Approximate lane-wise reciprocal of v: starts from the vrecpeq_f32 estimate
+// and applies two vrecpsq_f32 refinement steps.
+VM_INLINE float32x4_t rcp_2(float32x4_t v)
+{
+    float32x4_t estimate = vrecpeq_f32(v);
+    for (int step = 0; step < 2; ++step)
+    {
+        estimate = vmulq_f32(vrecpsq_f32(estimate, v), estimate);
+    }
+    return estimate;
+}
+
+// Component-wise arithmetic. Division multiplies by the refined reciprocal from
+// rcp_2 rather than using a divide instruction.
+VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
+VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
+VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
+VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
+VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
+VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
+VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
+VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
+// Compound assignment, expressed through the binary operators above.
+VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
+VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
+VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
+VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
+VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
+VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
+// Component-wise comparisons. The NEON compare intrinsics return uint32x4_t masks, so
+// the bits are reinterpreted explicitly to be stored in the float-typed bool3; this
+// keeps the code valid when implicit (lax) vector conversions are disabled.
+VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vceqq_f32(a.m, b.m)); return a; }
+VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a.m, b.m))); return a; }
+VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vcltq_f32(a.m, b.m)); return a; }
+VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vcgtq_f32(a.m, b.m)); return a; }
+VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vcleq_f32(a.m, b.m)); return a; }
+VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vreinterpretq_f32_u32(vcgeq_f32(a.m, b.m)); return a; }
+// Component-wise minimum / maximum.
+VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
+VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }
+
+// Negation.
+VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }
+
+// Smallest of the three components.
+VM_INLINE float hmin(float3 v)
+{
+    // The fourth lane is padding (0 after the component constructors), so copy Z
+    // into it first; otherwise the padding value would take part in the reduction.
+    float32x4_t xyzz = vsetq_lane_f32(vgetq_lane_f32(v.m, 2), v.m, 3);
+    float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(xyzz), vget_high_f32(xyzz));
+    float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
+    return vget_lane_f32(minOfMinOfHalfs, 0);
+}
+// Largest of the three components.
+VM_INLINE float hmax(float3 v)
+{
+    // As in hmin: replace the padding lane with Z so it cannot win the reduction.
+    float32x4_t xyzz = vsetq_lane_f32(vgetq_lane_f32(v.m, 2), v.m, 3);
+    float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(xyzz), vget_high_f32(xyzz));
+    float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
+    return vget_lane_f32(maxOfMaxOfHalfs, 0);
+}
+
+// Cross product a x b:
+//   x  <-  a.y*b.z - a.z*b.y
+//   y  <-  a.z*b.x - a.x*b.z
+//   z  <-  a.x*b.y - a.y*b.x
+// Computing the difference in rotated (zxy) order and rotating the result once
+// more saves a shuffle.
+VM_INLINE float3 cross(float3 a, float3 b)
+{
+    const float3 lhs = a.zxy() * b;
+    const float3 rhs = a * b.zxy();
+    const float3 rotated = lhs - rhs;
+    return rotated.zxy();
+}
+
+// Returns a 3-bit code where bit0..bit2 is X..Z
+VM_INLINE unsigned mask(float3 v)
+{
+    // The fourth lane is padding and gets weight 0, so it can never set bit 3.
+    // Without this, comparing two vectors with equal padding would report 15 and
+    // all() (which tests for 7) would fail even though X, Y and Z all matched.
+    static const uint32x4_t movemask = { 1, 2, 4, 0 };
+    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+    uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
+    // All-ones in every lane whose sign bit is set.
+    uint32x4_t t1 = vtstq_u32(t0, highbit);
+    // Keep only each lane's own bit, then OR the lanes together.
+    uint32x4_t t2 = vandq_u32(t1, movemask);
+    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
+    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
+}
+// Once we have a comparison, we can branch based on its results:
+VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
+VM_INLINE bool all(bool3 v) { return mask(v) == 7; }
+
+// Component-wise clamp of t: raised to at least a, then limited to at most b.
+VM_INLINE float3 clamp(float3 t, float3 a, float3 b)
+{
+    const float3 raised = max(t, a);
+    return min(raised, b);
+}
+// Sum of the three components.
+VM_INLINE float sum(float3 v)
+{
+    return v.getX() + v.getY() + v.getZ();
+}
+// Dot product.
+VM_INLINE float dot(float3 a, float3 b)
+{
+    const float3 products = a * b;
+    return sum(products);
+}
+
+
+#endif // #else of #if !defined(__arm__) && !defined(__arm64__)
+
+#else // #if DO_FLOAT3_WITH_SIMD
+
+// ---- Simple scalar C implementation
+
+
+// Plain three-component float vector (used when SIMD float3 is disabled).
+struct float3
+{
+    // Default construction yields the zero vector.
+    float3() : x(0), y(0), z(0) {}
+    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+
+    // Negation.
+    float3 operator-() const
+    {
+        return float3(-x, -y, -z);
+    }
+    // Component-wise compound assignment.
+    float3& operator+=(const float3& o)
+    {
+        x += o.x;
+        y += o.y;
+        z += o.z;
+        return *this;
+    }
+    float3& operator-=(const float3& o)
+    {
+        x -= o.x;
+        y -= o.y;
+        z -= o.z;
+        return *this;
+    }
+    float3& operator*=(const float3& o)
+    {
+        x *= o.x;
+        y *= o.y;
+        z *= o.z;
+        return *this;
+    }
+    // Uniform scale.
+    float3& operator*=(float o)
+    {
+        x *= o;
+        y *= o;
+        z *= o;
+        return *this;
+    }
+
+    // Accessors mirroring the SIMD float3 interface.
+    VM_INLINE float getX() const { return x; }
+    VM_INLINE float getY() const { return y; }
+    VM_INLINE float getZ() const { return z; }
+    VM_INLINE void setX(float x_) { x = x_; }
+    VM_INLINE void setY(float y_) { y = y_; }
+    VM_INLINE void setZ(float z_) { z = z_; }
+    // Writes x, y, z to p[0..2].
+    VM_INLINE void store(float *p) const
+    {
+        p[0] = getX();
+        p[1] = getY();
+        p[2] = getZ();
+    }
+
+    float x, y, z;
+};
+
+// Component-wise arithmetic between two vectors.
+VM_INLINE float3 operator+(const float3& a, const float3& b)
+{
+    return float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+VM_INLINE float3 operator-(const float3& a, const float3& b)
+{
+    return float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+VM_INLINE float3 operator*(const float3& a, const float3& b)
+{
+    return float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+// Uniform scale, with the scalar on either side.
+VM_INLINE float3 operator*(const float3& a, float b)
+{
+    return float3(a.x*b, a.y*b, a.z*b);
+}
+VM_INLINE float3 operator*(float a, const float3& b)
+{
+    return float3(a*b.x, a*b.y, a*b.z);
+}
+// Dot product.
+VM_INLINE float dot(const float3& a, const float3& b)
+{
+    return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+// Cross product a x b; the Y component is written as a negated difference.
+VM_INLINE float3 cross(const float3& a, const float3& b)
+{
+    return float3(a.y*b.z - a.z*b.y,
+                  -(a.x*b.z - a.z*b.x),
+                  a.x*b.y - a.y*b.x);
+}
+#endif // #else of #if DO_FLOAT3_WITH_SIMD
+
+// Euclidean length of v.
+VM_INLINE float length(float3 v)
+{
+    const float lengthSquared = dot(v, v);
+    return sqrtf(lengthSquared);
+}
+// Squared length of v.
+VM_INLINE float sqLength(float3 v)
+{
+    return dot(v, v);
+}
+// v scaled by the reciprocal of its length (no guard against zero length).
+VM_INLINE float3 normalize(float3 v)
+{
+    const float invLength = 1.0f / length(v);
+    return v * invLength;
+}
+// Linear interpolation: a at t = 0, b at t = 1.
+VM_INLINE float3 lerp(float3 a, float3 b, float t)
+{
+    const float3 delta = b - a;
+    return a + delta*t;
+}
+
+
+// Debug check that v is (approximately) unit length: asserts that its squared
+// length is within 0.01 of 1. As with any assert, this compiles to nothing when
+// NDEBUG is defined.
+inline void AssertUnit(float3 v)
+{
+    assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
+}
+
+// Reflects v about the normal n: v - 2 * dot(v, n) * n.
+inline float3 reflect(float3 v, float3 n)
+{
+    const float twiceProjection = 2*dot(v,n);
+    return v - twiceProjection*n;
+}
+
+// Refracts the unit vector v at a surface with normal n, where nint is the factor
+// applied to the tangential part (presumably the ratio of refractive indices).
+// Returns true and writes outRefracted when the discriminant is positive; returns
+// false and leaves outRefracted untouched otherwise.
+inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
+{
+    AssertUnit(v);
+    const float cosIn = dot(v, n);
+    const float discr = 1.0f - nint*nint*(1-cosIn*cosIn);
+    // No refracted direction unless the discriminant is strictly positive.
+    if (!(discr > 0))
+        return false;
+
+    outRefracted = nint * (v - n*cosIn) - n*sqrtf(discr);
+    return true;
+}
+// Schlick-style reflectance approximation: r0 + (1 - r0) * (1 - cosine)^5,
+// with r0 = ((1 - ri) / (1 + ri))^2.
+inline float schlick(float cosine, float ri)
+{
+    float base = (1-ri) / (1+ri);
+    base = base*base;
+    return base + (1-base)*powf(1-cosine, 5);
+}
+
+// A ray given by an origin and a direction that is expected to be unit length.
+struct Ray
+{
+    // Default construction leaves orig/dir in their default-constructed state.
+    Ray() {}
+    // Stores origin and direction; the direction is checked with AssertUnit.
+    Ray(float3 orig_, float3 dir_)
+        : orig(orig_)
+        , dir(dir_)
+    {
+        AssertUnit(dir);
+    }
+
+    // Point reached after travelling distance t along the ray.
+    float3 pointAt(float t) const
+    {
+        const float3 travelled = dir * t;
+        return orig + travelled;
+    }
+
+    float3 orig;
+    float3 dir;
+};
+
+
+// Result of a ray/sphere intersection, as filled in by HitSpheres.
+struct Hit
+{
+    float3 pos;     // hit position (r.pointAt(t))
+    float3 normal;  // (pos - sphere center) * sphere invRadius
+    float t;        // distance along the ray at which the hit occurred
+};
+
+
+// A sphere with a cached reciprocal radius. invRadius is 0 after construction and
+// only becomes valid once UpdateDerivedData() has been called.
+struct Sphere
+{
+    // Unit radius; center is left default-constructed.
+    Sphere()
+        : radius(1.0f)
+        , invRadius(0.0f)
+    {
+    }
+    Sphere(float3 center_, float radius_)
+        : center(center_)
+        , radius(radius_)
+        , invRadius(0.0f)
+    {
+    }
+
+    // Recomputes the values derived from radius.
+    void UpdateDerivedData()
+    {
+        invRadius = 1.0f/radius;
+    }
+
+    float3 center;
+    float radius;
+    float invRadius;
+};
+
+
+// Data for all spheres in a "structure of arrays" layout: one float array per
+// attribute, each simdCount entries long. The arrays are allocated here and freed
+// in the destructor. Entries [0, count) are NOT initialized by the constructor;
+// the caller is expected to fill them in.
+struct SpheresSoA
+{
+    // c is the number of real spheres.
+    SpheresSoA(int c)
+    {
+        count = c;
+        // we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
+        // enough space (count rounded up to a multiple of kSimdWidth)
+        simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
+        centerX = new float[simdCount];
+        centerY = new float[simdCount];
+        centerZ = new float[simdCount];
+        sqRadius = new float[simdCount];
+        invRadius = new float[simdCount];
+        // set the padding entries past the real spheres to an "impossible sphere"
+        // state (far away, zero radius)
+        for (int i = count; i < simdCount; ++i)
+        {
+            centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
+            sqRadius[i] = 0.0f;
+            invRadius[i] = 0.0f;
+        }
+    }
+    ~SpheresSoA()
+    {
+        delete[] centerX;
+        delete[] centerY;
+        delete[] centerZ;
+        delete[] sqRadius;
+        delete[] invRadius;
+    }
+    // The arrays are owned through raw pointers, so an implicit copy would share
+    // them and free them twice. Forbid copying instead.
+    SpheresSoA(const SpheresSoA&) = delete;
+    SpheresSoA& operator=(const SpheresSoA&) = delete;
+
+    float* centerX;
+    float* centerY;
+    float* centerZ;
+    float* sqRadius;
+    float* invRadius;
+    int simdCount; // count rounded up to a multiple of kSimdWidth (array length)
+    int count;     // number of real spheres
+};
+
+
+// Finds the closest sphere hit by r with tMin < t < tMax. Returns the sphere index
+// and fills outHit, or returns -1 when nothing is hit.
+int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);
+
+// Random helpers; each call advances the caller-supplied generator state.
+// RandomFloat01 returns a value in [0, 1).
+float RandomFloat01(uint32_t& state);
+// Point inside the unit disk of the XY plane (z = 0).
+float3 RandomInUnitDisk(uint32_t& state);
+// Point inside the unit sphere.
+float3 RandomInUnitSphere(uint32_t& state);
+// Direction of unit length.
+float3 RandomUnitVector(uint32_t& state);
+
+// Camera with a lens aperture: precomputes the image-plane frame in the
+// constructor and generates rays through it in GetRay.
+struct Camera
+{
+    // Default construction leaves lensRadius uninitialized.
+    Camera() {}
+    // vfov is top to bottom in degrees
+    // aspect scales the half-height into the half-width; aperture is the lens
+    // diameter (lensRadius = aperture / 2); focusDist scales the image-plane extents
+    // and its distance along -w.
+    Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
+    {
+        lensRadius = aperture / 2;
+        float theta = vfov*kPI/180; // degrees -> radians
+        float halfHeight = tanf(theta/2);
+        float halfWidth = aspect * halfHeight;
+        origin = lookFrom;
+        // Camera basis: w points from the target back to the camera, u is
+        // perpendicular to vup and w, v completes the frame.
+        w = normalize(lookFrom - lookAt);
+        u = normalize(cross(vup, w));
+        v = cross(w, u);
+        lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
+        horizontal = 2*halfWidth*focusDist*u;
+        vertical = 2*halfHeight*focusDist*v;
+    }
+
+    // Builds a ray through the image-plane point lowerLeftCorner + s*horizontal +
+    // t*vertical, starting from a random point on the lens disk (origin offset along
+    // u and v). Advances `state`.
+    // NOTE(review): s and t are presumably in [0, 1] across the image -- confirm at the call site.
+    Ray GetRay(float s, float t, uint32_t& state) const
+    {
+        float3 rd = lensRadius * RandomInUnitDisk(state);
+        float3 offset = u * rd.getX() + v * rd.getY();
+        return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
+    }
+
+    float3 origin;
+    float3 lowerLeftCorner;
+    float3 horizontal;
+    float3 vertical;
+    float3 u, v, w;
+    float lensRadius;
+};
+
--- a/examples/ToyPathTracer/Source/Test.cpp
+++ b/examples/ToyPathTracer/Source/Test.cpp
@ -0,0 +1,380 @@
+#include "Config.h"
+#include "Test.h"
+#include "Maths.h"
+#include <algorithm>
+#if CPU_CAN_DO_THREADS
+#include "enkiTS/TaskScheduler_c.h"
+#endif
+#include <atomic>
+
+// 46 spheres (2 emissive) when enabled; 9 spheres (1 emissive) when disabled
+#define DO_BIG_SCENE 1
+
+// Scene geometry. Each entry is written as {center, radius}; entry i is paired with
+// s_SphereMats[i]. The entries inside DO_BIG_SCENE are only compiled in when it is non-zero.
+static Sphere s_Spheres[] =
+{
+    {float3(0,-100.5,-1), 100},
+    {float3(2,0,-1), 0.5f},
+    {float3(0,0,-1), 0.5f},
+    {float3(-2,0,-1), 0.5f},
+    {float3(2,0,1), 0.5f},
+    {float3(0,0,1), 0.5f},
+    {float3(-2,0,1), 0.5f},
+    {float3(0.5f,1,0.5f), 0.5f},
+    {float3(-1.5f,1.5f,0.f), 0.3f},
+#if DO_BIG_SCENE
+    {float3(4,0,-3), 0.5f}, {float3(3,0,-3), 0.5f}, {float3(2,0,-3), 0.5f}, {float3(1,0,-3), 0.5f}, {float3(0,0,-3), 0.5f}, {float3(-1,0,-3), 0.5f}, {float3(-2,0,-3), 0.5f}, {float3(-3,0,-3), 0.5f}, {float3(-4,0,-3), 0.5f},
+    {float3(4,0,-4), 0.5f}, {float3(3,0,-4), 0.5f}, {float3(2,0,-4), 0.5f}, {float3(1,0,-4), 0.5f}, {float3(0,0,-4), 0.5f}, {float3(-1,0,-4), 0.5f}, {float3(-2,0,-4), 0.5f}, {float3(-3,0,-4), 0.5f}, {float3(-4,0,-4), 0.5f},
+    {float3(4,0,-5), 0.5f}, {float3(3,0,-5), 0.5f}, {float3(2,0,-5), 0.5f}, {float3(1,0,-5), 0.5f}, {float3(0,0,-5), 0.5f}, {float3(-1,0,-5), 0.5f}, {float3(-2,0,-5), 0.5f}, {float3(-3,0,-5), 0.5f}, {float3(-4,0,-5), 0.5f},
+    {float3(4,0,-6), 0.5f}, {float3(3,0,-6), 0.5f}, {float3(2,0,-6), 0.5f}, {float3(1,0,-6), 0.5f}, {float3(0,0,-6), 0.5f}, {float3(-1,0,-6), 0.5f}, {float3(-2,0,-6), 0.5f}, {float3(-3,0,-6), 0.5f}, {float3(-4,0,-6), 0.5f},
+    {float3(1.5f,1.5f,-2), 0.3f},
+#endif // #if DO_BIG_SCENE
+};
+// Number of entries in s_Spheres.
+const int kSphereCount = sizeof(s_Spheres) / sizeof(s_Spheres[0]);
+
+static SpheresSoA s_SpheresSoA(kSphereCount);
+
+// Surface description for one sphere; Scatter() branches on `type`.
+struct Material
+{
+    enum Type { Lambert, Metal, Dielectric };
+    Type type;
+    float3 albedo;    // used as the attenuation of scattered rays for Lambert and Metal
+    float3 emissive;  // emitted light; any component > 0 makes the sphere a light source
+    float roughness;  // Metal only: scales the random perturbation added to the reflection direction
+    float ri;         // Dielectric only: refraction index passed to refract()/schlick()
+};
+
+// Per-sphere materials: entry i belongs to s_Spheres[i].
+// Initializer order is { type, albedo, emissive, roughness, ri }.
+static Material s_SphereMats[kSphereCount] =
+{
+    { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, },
+    { Material::Lambert, float3(0.8f, 0.4f, 0.4f), float3(0,0,0), 0, 0, },
+    { Material::Lambert, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0, },
+    { Material::Metal, float3(0.4f, 0.4f, 0.8f), float3(0,0,0), 0, 0 },
+    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0 },
+    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.2f, 0 },
+    { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.6f, 0 },
+    { Material::Dielectric, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 1.5f },
+    { Material::Lambert, float3(0.8f, 0.6f, 0.2f), float3(30,25,15), 0, 0 },
+#if DO_BIG_SCENE
+    { Material::Lambert, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
+    { Material::Metal, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, },
+    { Material::Metal, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
+    { Material::Lambert, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, },
+    { Material::Lambert, float3(0.1f, 0.2f, 0.5f), float3(3,10,20), 0, 0 },
+#endif
+};
+
+// Indices (into s_Spheres / s_SphereMats) of the spheres whose material is emissive;
+// rebuilt by UpdateTest(). Only the first s_EmissiveSphereCount entries are valid.
+static int s_EmissiveSpheres[kSphereCount];
+static int s_EmissiveSphereCount;
+
+// Camera used for rendering; reassigned by UpdateTest().
+static Camera s_Cam;
+
+// Ray parameter range passed as tMin/tMax to HitWorld().
+const float kMinT = 0.001f;
+const float kMaxT = 1.0e7f;
+// Trace() stops scattering once the recursion depth reaches this value.
+const int kMaxDepth = 10;
+
+
+// Tests ray r against the scene spheres through HitSpheres().
+// The sphere index reported by HitSpheres() is stored in outID; the function returns
+// false when that index is -1 and true otherwise. outHit is filled in by HitSpheres().
+bool HitWorld(const Ray& r, float tMin, float tMax, Hit& outHit, int& outID)
+{
+    const int sphereIndex = HitSpheres(r, s_SpheresSoA, tMin, tMax, outHit);
+    outID = sphereIndex;
+    if (sphereIndex == -1)
+        return false;
+    return true;
+}
+
+
+// Computes how a ray continues after hitting a surface.
+//   mat           - material of the surface that was hit
+//   r_in          - incoming ray
+//   rec           - hit record; pos and normal are read
+//   attenuation   - out: colour multiplier applied to whatever the scattered ray returns
+//   scattered     - out: the ray to trace next (not written for an unknown material type)
+//   outLightE     - out: reset to zero on entry; in the Lambert branch with DO_LIGHT_SAMPLING it
+//                   accumulates the direct contribution of every emissive sphere that is visible
+//   inoutRayCount - incremented once per shadow ray
+//   state         - random number generator state, advanced by the random draws made here
+// Returns false when the scattered ray should not be followed (a metal reflection that does not
+// point away from the surface, or an unknown material type); true otherwise.
+static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3& attenuation, Ray& scattered, float3& outLightE, int& inoutRayCount, uint32_t& state)
+{
+    outLightE = float3(0,0,0);
+    if (mat.type == Material::Lambert)
+    {
+        // random point on unit sphere that is tangent to the hit point
+        float3 target = rec.pos + rec.normal + RandomUnitVector(state);
+        scattered = Ray(rec.pos, normalize(target - rec.pos));
+        attenuation = mat.albedo;
+
+        // sample lights
+#if DO_LIGHT_SAMPLING
+        for (int j = 0; j < s_EmissiveSphereCount; ++j)
+        {
+            int i = s_EmissiveSpheres[j];
+            const Material& smat = s_SphereMats[i];
+            // identity comparison: mat is expected to be a reference into s_SphereMats
+            if (&mat == &smat)
+                continue; // skip self
+            const Sphere& s = s_Spheres[i];
+
+            // create a random direction towards sphere
+            // coord system for sampling: sw, su, sv
+            float3 sw = normalize(s.center - rec.pos);
+            float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw));
+            float3 sv = cross(sw, su);
+            // sample sphere by solid angle
+            // NOTE(review): the sqrtf argument becomes negative (NaN result) if rec.pos lies inside
+            // sphere s - presumably that cannot happen with this scene layout; confirm.
+            float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center));
+            float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
+            float cosA = 1.0f - eps1 + eps1 * cosAMax;
+            float sinA = sqrtf(1.0f - cosA*cosA);
+            float phi = 2 * kPI * eps2;
+            float3 l = su * (cosf(phi) * sinA) + sv * (sinf(phi) * sinA) + sw * cosA;
+            //l = normalize(l); // NOTE(fg): This is already normalized, by construction.
+
+            // shoot shadow ray
+            Hit lightHit;
+            int hitID;
+            ++inoutRayCount;
+            // the light only contributes when the shadow ray reaches sphere i itself
+            if (HitWorld(Ray(rec.pos, l), kMinT, kMaxT, lightHit, hitID) && hitID == i)
+            {
+                float omega = 2 * kPI * (1-cosAMax);
+
+                float3 rdir = r_in.dir;
+                AssertUnit(rdir);
+                // nl: the surface normal flipped, if needed, to face against the incoming ray
+                float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
+                outLightE += (mat.albedo * smat.emissive) * (std::max(0.0f, dot(l, nl)) * omega / kPI);
+            }
+        }
+#endif
+        return true;
+    }
+    else if (mat.type == Material::Metal)
+    {
+        AssertUnit(r_in.dir); AssertUnit(rec.normal);
+        float3 refl = reflect(r_in.dir, rec.normal);
+        // reflected ray, and random inside of sphere based on roughness
+        float roughness = mat.roughness;
+#if DO_MITSUBA_COMPARE
+        roughness = 0; // until we get better BRDF for metals
+#endif
+        scattered = Ray(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
+        attenuation = mat.albedo;
+        // only follow the ray when it leaves on the normal's side of the surface
+        return dot(scattered.dir, rec.normal) > 0;
+    }
+    else if (mat.type == Material::Dielectric)
+    {
+        AssertUnit(r_in.dir); AssertUnit(rec.normal);
+        float3 outwardN;
+        float3 rdir = r_in.dir;
+        float3 refl = reflect(rdir, rec.normal);
+        float nint;
+        attenuation = float3(1,1,1);
+        float3 refr;
+        float reflProb;
+        float cosine;
+        // pick the normal orientation and index ratio depending on which side the ray arrives from
+        if (dot(rdir, rec.normal) > 0)
+        {
+            outwardN = -rec.normal;
+            nint = mat.ri;
+            cosine = mat.ri * dot(rdir, rec.normal);
+        }
+        else
+        {
+            outwardN = rec.normal;
+            nint = 1.0f / mat.ri;
+            cosine = -dot(rdir, rec.normal);
+        }
+        if (refract(rdir, outwardN, nint, refr))
+        {
+            reflProb = schlick(cosine, mat.ri);
+        }
+        else
+        {
+            // refract() reported failure: always reflect
+            reflProb = 1;
+        }
+        // choose randomly between reflection and refraction according to reflProb
+        if (RandomFloat01(state) < reflProb)
+            scattered = Ray(rec.pos, normalize(refl));
+        else
+            scattered = Ray(rec.pos, normalize(refr));
+    }
+    else
+    {
+        // unknown material type
+        attenuation = float3(1,0,1);
+        return false;
+    }
+    return true;
+}
+
+// Recursively traces ray r through the scene and returns the light arriving along it.
+//   r             - ray to trace
+//   depth         - current recursion depth; scattering stops once it reaches kMaxDepth
+//   inoutRayCount - incremented for this ray and for every ray traced below it
+//   state         - random number generator state
+//   doMaterialE   - when false (and DO_LIGHT_SAMPLING is on) the emission of the surface hit by
+//                   this ray is left out, if the ray is scattered further
+static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state, bool doMaterialE = true)
+{
+    Hit rec;
+    int id = 0;
+    ++inoutRayCount;
+    if (HitWorld(r, kMinT, kMaxT, rec, id))
+    {
+        Ray scattered;
+        float3 attenuation;
+        float3 lightE;
+        const Material& mat = s_SphereMats[id];
+        float3 matE = mat.emissive;
+        if (depth < kMaxDepth && Scatter(mat, r, rec, attenuation, scattered, lightE, inoutRayCount, state))
+        {
+#if DO_LIGHT_SAMPLING
+            if (!doMaterialE) matE = float3(0,0,0); // don't add material emission if told so
+            // for Lambert materials, we just did explicit light (emissive) sampling and already
+            // accounted for their contribution, so if the next ray bounce hits the light again,
+            // don't add emission
+            doMaterialE = (mat.type != Material::Lambert);
+#endif
+            return matE + lightE + attenuation * Trace(scattered, depth+1, inoutRayCount, state, doMaterialE);
+        }
+        else
+        {
+            // depth limit reached or Scatter() returned false: only the surface's own emission remains
+            return matE;
+        }
+    }
+    else
+    {
+        // sky
+#if DO_MITSUBA_COMPARE
+        return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light
+#else
+        // blend between two colours based on the ray direction's Y component
+        float3 unitDir = r.dir;
+        float t = 0.5f*(unitDir.getY() + 1.0f);
+        return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f;
+#endif
+    }
+}
+
+#if CPU_CAN_DO_THREADS
+// Task scheduler created in InitializeTest(), used by DrawTest() and deleted in ShutdownTest().
+static enkiTaskScheduler* g_TS;
+#endif
+
+// Creates and initializes the task scheduler used for rendering.
+// Does nothing when CPU_CAN_DO_THREADS is 0.
+void InitializeTest()
+{
+#if CPU_CAN_DO_THREADS
+    enkiTaskScheduler* scheduler = enkiNewTaskScheduler();
+    enkiInitTaskScheduler(scheduler);
+    g_TS = scheduler;
+#endif
+}
+
+// Deletes the task scheduler created by InitializeTest().
+// Does nothing when CPU_CAN_DO_THREADS is 0.
+void ShutdownTest()
+{
+    #if CPU_CAN_DO_THREADS
+    enkiDeleteTaskScheduler(g_TS);
+    // do not keep a dangling pointer to the deleted scheduler around
+    g_TS = nullptr;
+    #endif
+}
+
+// Per-frame arguments handed to TraceRowJob() through its void* parameter; filled in by DrawTest().
+struct JobData
+{
+    float time;                     // frame time as passed to DrawTest(); not read by TraceRowJob()
+    int frameCount;                 // used for the per-row random seed and the blend factor
+    int screenWidth, screenHeight;  // image size in pixels
+    float* backbuffer;              // output image, 4 floats per pixel
+    Camera* cam;                    // camera that generates the primary rays
+    std::atomic<int> rayCount;      // total rays traced; each job adds its own count at the end
+    unsigned testFlags;             // combination of kFlag* bits
+};
+
+// Task entry point: path-traces the image rows [start, end).
+//   start, end - first row and one-past-last row to render
+//   threadnum  - not used by this function
+//   data_      - pointer to the JobData describing the frame
+// Every pixel averages DO_SAMPLES_PER_PIXEL samples, is blended with the value already stored in
+// the backbuffer using lerpFac, and is written back. The number of rays traced is added to
+// JobData::rayCount once at the end.
+static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void* data_)
+{
+    JobData& data = *(JobData*)data_;
+    // the backbuffer holds 4 floats per pixel; skip to the first row of this job
+    float* backbuffer = data.backbuffer + start * data.screenWidth * 4;
+    float invWidth = 1.0f / data.screenWidth;
+    float invHeight = 1.0f / data.screenHeight;
+    // weight of the previous frame's pixel value when blending
+    float lerpFac = float(data.frameCount) / float(data.frameCount+1);
+    if (data.testFlags & kFlagAnimate)
+        lerpFac *= DO_ANIMATE_SMOOTHING;
+    if (!(data.testFlags & kFlagProgressive))
+        lerpFac = 0;
+    int rayCount = 0;
+    for (uint32_t y = start; y < end; ++y)
+    {
+        // Per-row seed. The arithmetic is done entirely in uint32_t so that it wraps around
+        // instead of overflowing a signed int when frameCount grows large; | 1 keeps it non-zero.
+        uint32_t state = (y * 9781u + uint32_t(data.frameCount) * 6271u) | 1u;
+        for (int x = 0; x < data.screenWidth; ++x)
+        {
+            float3 col(0, 0, 0);
+            for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++)
+            {
+                // jitter the sample position inside the pixel
+                float u = float(x + RandomFloat01(state)) * invWidth;
+                float v = float(y + RandomFloat01(state)) * invHeight;
+                Ray r = data.cam->GetRay(u, v, state);
+                col += Trace(r, 0, rayCount, state);
+            }
+            col *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
+
+            float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]);
+            col = prev * lerpFac + col * (1-lerpFac);
+            col.store(backbuffer);
+            backbuffer += 4;
+        }
+    }
+    data.rayCount += rayCount;
+}
+
+// Per-frame scene update: optionally animates two spheres, refreshes the SoA sphere data and the
+// list of emissive spheres, and rebuilds the camera.
+//   time        - drives the animation when kFlagAnimate is set
+//   frameCount  - not used by this function
+//   screenWidth, screenHeight - their ratio is the camera aspect
+//   testFlags   - combination of kFlag* bits; only kFlagAnimate is checked here
+void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags)
+{
+    if (testFlags & kFlagAnimate)
+    {
+        // hard-coded indices into s_Spheres: entry 1 moves along Y, entry 8 (whose material is
+        // emissive) moves along Z
+        s_Spheres[1].center.setY(cosf(time) + 1.0f);
+        s_Spheres[8].center.setZ(sinf(time)*0.3f);
+    }
+    float3 lookfrom(0, 2, 3);
+    float3 lookat(0, 0, 0);
+    float distToFocus = 3;
+#if DO_MITSUBA_COMPARE
+    float aperture = 0.0f;
+#else
+    float aperture = 0.1f;
+#endif
+#if DO_BIG_SCENE
+    aperture *= 0.2f;
+#endif
+
+    s_EmissiveSphereCount = 0;
+    for (int i = 0; i < kSphereCount; ++i)
+    {
+        Sphere& s = s_Spheres[i];
+        s.UpdateDerivedData();
+        // mirror the sphere into the structure-of-arrays layout that HitSpheres() reads
+        s_SpheresSoA.centerX[i] = s.center.getX();
+        s_SpheresSoA.centerY[i] = s.center.getY();
+        s_SpheresSoA.centerZ[i] = s.center.getZ();
+        s_SpheresSoA.sqRadius[i] = s.radius * s.radius;
+        s_SpheresSoA.invRadius[i] = s.invRadius;
+
+        // Remember IDs of emissive spheres (light sources)
+        const Material& smat = s_SphereMats[i];
+        if (smat.emissive.getX() > 0 || smat.emissive.getY() > 0 || smat.emissive.getZ() > 0)
+        {
+            s_EmissiveSpheres[s_EmissiveSphereCount] = i;
+            s_EmissiveSphereCount++;
+        }
+    }
+
+    // 60 is the vertical field of view in degrees
+    s_Cam = Camera(lookfrom, lookat, float3(0, 1, 0), 60, float(screenWidth) / float(screenHeight), aperture, distToFocus);
+}
+
+// Renders one frame into backbuffer and reports the number of rays traced in outRayCount.
+// With CPU_CAN_DO_THREADS the image rows are handed to the task scheduler as one task set and the
+// call blocks until that set has finished; otherwise all rows are traced on the calling thread.
+void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags)
+{
+    JobData job;
+    job.rayCount = 0;
+    job.testFlags = testFlags;
+    job.cam = &s_Cam;
+    job.backbuffer = backbuffer;
+    job.screenHeight = screenHeight;
+    job.screenWidth = screenWidth;
+    job.frameCount = frameCount;
+    job.time = time;
+
+#if CPU_CAN_DO_THREADS
+    // minimum range passed to enkiAddTaskSetToPipeMinRange: 4 rows when threading is on,
+    // the whole image otherwise
+    const bool useThreads = true;
+    const int minRows = useThreads ? 4 : screenHeight;
+    enkiTaskSet* rowTasks = enkiCreateTaskSet(g_TS, TraceRowJob);
+    enkiAddTaskSetToPipeMinRange(g_TS, rowTasks, &job, screenHeight, minRows);
+    enkiWaitForTaskSet(g_TS, rowTasks);
+    enkiDeleteTaskSet(rowTasks);
+#else
+    TraceRowJob(0, screenHeight, 0, &job);
+#endif
+
+    outRayCount = job.rayCount;
+}
+
+// Reports the number of spheres in the scene and the byte sizes of the Sphere, Material and
+// Camera structures.
+void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize)
+{
+    const int sphereCount = kSphereCount;
+    outCount        = sphereCount;
+    outObjectSize   = int(sizeof(Sphere));
+    outMaterialSize = int(sizeof(Material));
+    outCamSize      = int(sizeof(Camera));
+}
+
+// Copies the scene description into caller-provided buffers.
+//   outObjects       - receives all kSphereCount spheres
+//   outMaterials     - receives all kSphereCount materials
+//   outCam           - receives the current camera
+//   outEmissives     - receives the indices of the emissive spheres
+//   outEmissiveCount - receives the number of emissive sphere indices written
+void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount)
+{
+    const int emissiveCount = s_EmissiveSphereCount;
+    memcpy(outObjects, &s_Spheres[0], sizeof(s_Spheres));
+    memcpy(outMaterials, &s_SphereMats[0], sizeof(s_SphereMats));
+    memcpy(outCam, &s_Cam, sizeof(Camera));
+    memcpy(outEmissives, &s_EmissiveSpheres[0], emissiveCount * sizeof(int));
+    *outEmissiveCount = emissiveCount;
+}
--- a/examples/ToyPathTracer/Source/Test.h
+++ b/examples/ToyPathTracer/Source/Test.h
@ -0,0 +1,17 @@
+#pragma once
+#include <stdint.h>
+
+// Bit flags passed as testFlags to UpdateTest() and DrawTest().
+enum TestFlags
+{
+    kFlagAnimate = (1 << 0),     // animate the scene
+    kFlagProgressive = (1 << 1), // blend each new frame with the previous backbuffer contents
+};
+
+void InitializeTest();
+void ShutdownTest();
+
+void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags);
+void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags);
+
+void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize);
+void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount);
--- a/examples/ToyPathTracer/Source/enkiTS/Atomics.h
+++ b/examples/ToyPathTracer/Source/enkiTS/Atomics.h
@ -0,0 +1,79 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#pragma once
+
+#include <stdint.h>
+
+// Platform layer: defines BASE_MEMORYBARRIER_ACQUIRE / BASE_MEMORYBARRIER_RELEASE and
+// BASE_ALIGN(x) for Windows and for compilers that accept GCC-style __asm__ / __attribute__.
+#ifdef _WIN32
+    #define WIN32_LEAN_AND_MEAN
+    #include <Windows.h>
+	#undef GetObject
+    #include <intrin.h>
+
+    extern "C" void _ReadWriteBarrier();
+    #pragma intrinsic(_ReadWriteBarrier)
+    #pragma intrinsic(_InterlockedCompareExchange)
+    #pragma intrinsic(_InterlockedExchangeAdd)
+
+    // Memory barriers to prevent re-ordering.
+    // NOTE(review): _ReadWriteBarrier() looks like a compiler-only barrier (no CPU fence
+    // instruction) - confirm that is sufficient for every target this is built for.
+    #define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier()
+    #define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier()
+    #define BASE_ALIGN(x) __declspec( align( x ) ) 
+
+#else
+    // Empty asm statement with a "memory" clobber.
+    // NOTE(review): this also looks like a compiler-only barrier - confirm for weakly-ordered CPUs.
+    #define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory")  
+    #define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory")  
+	#define BASE_ALIGN(x)  __attribute__ ((aligned( x )))
+#endif
+
+// Thin wrappers that map the atomic operations used by the scheduler onto the Windows
+// _Interlocked* intrinsics or the GCC-style __sync_* builtins.
+namespace enki
+{
+    // Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; }
+    // returns old *pDest (so if successful, returns compareWith)
+    inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith )
+    {
+       #ifdef _WIN32
+			// assumes two's complement - unsigned / signed conversion leads to same bit pattern
+            return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith );
+        #else
+            // note the argument order: the builtin takes the comparand before the new value
+            return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
+        #endif      
+    }
+
+    // 64-bit overload of the compare-and-swap above; same contract.
+    inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith )
+    {
+       #ifdef _WIN32
+			// assumes two's complement - unsigned / signed conversion leads to same bit pattern
+            return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith );
+        #else
+            // note the argument order: the builtin takes the comparand before the new value
+            return __sync_val_compare_and_swap( pDest, compareWith, swapTo );
+        #endif      
+    }	
+
+    // Atomically performs: tmp = *pDest; *pDest += value; return tmp;
+    inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value )
+    {
+       #ifdef _WIN32
+            return _InterlockedExchangeAdd( (long*)pDest, value );
+        #else
+            return __sync_fetch_and_add( pDest, value );
+        #endif      
+    }
+
+}
--- a/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h
+++ b/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h
@ -0,0 +1,240 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#pragma once
+
+#include <stdint.h>
+#include <assert.h>
+
+#include "Atomics.h"
+#include <string.h>
+
+
+namespace enki
+{
+    // LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming
+    // Readers can only read from the back of the pipe
+    // The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader)
+    // for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx
+    // Note: using log2 sizes so we do not need to clamp (multi-operation)
+    // T is the contained type
+    // Note this is not true lockless as the use of flags as a form of lock state.
+    // Fixed-capacity pipe holding ( 1 << cSizeLog2 ) entries of type T. Each slot has a flag in
+    // m_Flags that says whether it may be written, may be read, or is currently claimed.
+    template<uint8_t cSizeLog2, typename T> class LockLessMultiReadPipe
+    {
+    public:
+        LockLessMultiReadPipe();
+        ~LockLessMultiReadPipe() {}
+
+        // ReaderTryReadBack returns false if we were unable to read
+        // This is thread safe for both multiple readers and the writer
+        bool ReaderTryReadBack(   T* pOut );
+
+        // WriterTryReadFront returns false if we were unable to read
+        // This is thread safe for the single writer, but should not be called by readers
+        bool WriterTryReadFront(  T* pOut );
+
+        // WriterTryWriteFront returns false if we were unable to write
+        // This is thread safe for the single writer, but should not be called by readers
+        bool WriterTryWriteFront( const T& in );
+
+        // IsPipeEmpty() is a utility function, not intended for general use
+        // Should only be used very prudently.
+        // Compares the write index with the read count; the two are read separately.
+        bool IsPipeEmpty() const
+        {
+            return 0 == m_WriteIndex - m_ReadCount;
+        }
+
+        // Resets all indices to zero and marks every slot writable (FLAG_CAN_WRITE is 0).
+        // Performs plain stores and a memset, without atomics or barriers.
+        void Clear()
+        {
+            m_WriteIndex = 0;
+            m_ReadIndex = 0;
+            m_ReadCount = 0;
+            memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
+        }
+
+    private:
+        const static uint32_t           ms_cSize        = ( 1 << cSizeLog2 );  // number of slots
+        const static uint32_t           ms_cIndexMask   = ms_cSize - 1;        // index & mask == index % size
+        // Slot states. A slot moves CAN_WRITE -> CAN_READ when written, CAN_READ -> INVALID when a
+        // reader claims it via compare-and-swap, and INVALID -> CAN_WRITE once its data was copied out.
+        const static uint32_t           FLAG_INVALID    = 0xFFFFFFFF; // 32bit for CAS
+        const static uint32_t           FLAG_CAN_WRITE  = 0x00000000; // 32bit for CAS
+        const static uint32_t           FLAG_CAN_READ   = 0x11111111; // 32bit for CAS
+
+        T                               m_Buffer[ ms_cSize ];
+
+        // read and write indexes allow fast access to the pipe, but actual access
+        // controlled by the access flags.
+        volatile uint32_t BASE_ALIGN(4) m_WriteIndex;   // written only by the writer
+        volatile uint32_t BASE_ALIGN(4) m_ReadCount;    // incremented atomically by ReaderTryReadBack
+        volatile uint32_t               m_Flags[  ms_cSize ];
+        volatile uint32_t BASE_ALIGN(4) m_ReadIndex;    // restart position for readers; set by the writer
+    };
+
+    // Constructs an empty pipe: all indices start at zero and every slot flag is zero-filled,
+    // which equals FLAG_CAN_WRITE.
+    // The initializers are listed in member declaration order, which is the order in which C++
+    // runs them regardless of how they are written (avoids -Wreorder warnings).
+    template<uint8_t cSizeLog2, typename T> inline
+        LockLessMultiReadPipe<cSizeLog2,T>::LockLessMultiReadPipe()
+        : m_WriteIndex(0)
+        , m_ReadCount(0)
+        , m_ReadIndex(0)
+    {
+        // ms_cSize is computed as ( 1 << cSizeLog2 ) in a 32-bit value
+        assert( cSizeLog2 < 32 );
+        memset( (void*)m_Flags, 0, sizeof( m_Flags ) );
+    }
+
+    // Tries to take one item from the back of the pipe.
+    // Scans forward from the current read count for a slot flagged FLAG_CAN_READ and claims it
+    // with a compare-and-swap. On success the item is copied to *pOut, the slot is released for
+    // writing and true is returned. Returns false when the pipe is observed to be empty.
+    template<uint8_t cSizeLog2, typename T> inline
+        bool LockLessMultiReadPipe<cSizeLog2,T>::ReaderTryReadBack(   T* pOut )
+    {
+
+        uint32_t actualReadIndex;
+
+        uint32_t readCount  = m_ReadCount;
+
+        // We get hold of read index for consistency,
+        // and do first pass starting at read count
+        uint32_t readIndexToUse  = readCount;
+
+
+        while(true)
+        {
+
+            uint32_t writeIndex = m_WriteIndex;
+            // power of two sizes ensures we can use a simple calc without modulus
+            uint32_t numInPipe = writeIndex - readCount;
+            if( 0 == numInPipe )
+            {
+                return false;
+            }
+            if( readIndexToUse >= writeIndex )
+            {
+                // move back to start
+                readIndexToUse = m_ReadIndex;
+            }
+
+
+            // power of two sizes ensures we can perform AND for a modulus
+            actualReadIndex    = readIndexToUse & ms_cIndexMask;
+
+            // Multiple potential readers mean we should check if the data is valid,
+            // using an atomic compare exchange
+            uint32_t previous = AtomicCompareAndSwap( &m_Flags[  actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
+            if( FLAG_CAN_READ == previous )
+            {
+                // we now own this slot
+                break;
+            }
+            ++readIndexToUse;
+
+            //update known readcount
+            readCount  = m_ReadCount;
+        }
+
+        // we update the read index using an atomic add, as we've only read one piece of data.
+        // this ensure consistency of the read index, and the above loop ensures readers
+        // only read from unread data
+        AtomicAdd(  (volatile int32_t*)&m_ReadCount, 1 );
+
+        BASE_MEMORYBARRIER_ACQUIRE();
+        // now read data, ensuring we do so after above reads & CAS
+        *pOut = m_Buffer[ actualReadIndex ];
+
+        // hand the slot back to the writer
+        m_Flags[  actualReadIndex ] = FLAG_CAN_WRITE;
+
+        return true;
+    }
+
+    // Tries to take one item back from the front of the pipe (the end the writer writes to).
+    // Walks backwards from the write index looking for a slot flagged FLAG_CAN_READ and claims it
+    // with a compare-and-swap. On success the item is copied to *pOut, the slot is released, the
+    // write index is decremented and true is returned. Returns false when nothing could be claimed.
+    template<uint8_t cSizeLog2, typename T> inline
+        bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryReadFront(  T* pOut )
+    {
+        uint32_t writeIndex = m_WriteIndex;
+        uint32_t frontReadIndex  = writeIndex;
+
+        // Multiple potential readers mean we should check if the data is valid,
+        // using an atomic compare exchange - which acts as a form of lock (so not quite lockless really).
+        uint32_t previous = FLAG_INVALID;
+        uint32_t actualReadIndex = 0;
+        while( true )
+        {
+            // power of two sizes ensures we can use a simple calc without modulus
+            uint32_t readCount = m_ReadCount;
+            uint32_t numInPipe = writeIndex - readCount;
+            if( 0 == numInPipe || 0 == frontReadIndex )
+            {
+                // frontReadIndex can get to 0 here if that item was just being read by another thread.
+                m_ReadIndex = readCount;
+                return false;
+            }
+            --frontReadIndex;
+            actualReadIndex = frontReadIndex & ms_cIndexMask;
+            previous = AtomicCompareAndSwap( &m_Flags[  actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ );
+            if( FLAG_CAN_READ == previous )
+            {
+                // we now own this slot
+                break;
+            }
+            else if( m_ReadIndex >= frontReadIndex  )
+            {
+                // walked back as far as the readers' restart position without finding an item
+                return false;
+            }
+        }
+
+        // now read data, ensuring we do so after above reads & CAS
+        *pOut = m_Buffer[ actualReadIndex ];
+
+        m_Flags[  actualReadIndex ] = FLAG_CAN_WRITE;
+
+        BASE_MEMORYBARRIER_RELEASE();
+
+        // 32-bit aligned stores are atomic, and writer owns the write index
+        // we only move one back as this is as many as we have read, not where we have read from.
+        --m_WriteIndex;
+        return true;
+    }
+
+
+    // Tries to append `in` at the front of the pipe.
+    // Returns false, without writing, when the slot at the write index is not flagged
+    // FLAG_CAN_WRITE. Otherwise the item is stored, the slot is flagged FLAG_CAN_READ,
+    // the write index is advanced by one and true is returned.
+    template<uint8_t cSizeLog2, typename T> inline
+        bool LockLessMultiReadPipe<cSizeLog2,T>::WriterTryWriteFront( const T& in )
+    {
+        // The writer 'owns' the write index, and readers can only reduce
+        // the amount of data in the pipe.
+        // We get hold of both values for consistency and to reduce false sharing
+        // impacting more than one access
+        uint32_t writeIndex = m_WriteIndex;
+
+
+        // power of two sizes ensures we can perform AND for a modulus
+        uint32_t actualWriteIndex    = writeIndex & ms_cIndexMask;
+
+        // a reader may still be reading this item, as there are multiple readers
+        if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE ) 
+        {
+            return false; // still being read, so have caught up with tail.
+        }
+
+
+        // as we are the only writer we can update the data without atomics
+        //  whilst the write index has not been updated
+        m_Buffer[ actualWriteIndex ] = in;
+        m_Flags[  actualWriteIndex ] = FLAG_CAN_READ;
+
+        // We need to ensure the above writes occur prior to updating the write index,
+        // otherwise another thread might read before it's finished
+        BASE_MEMORYBARRIER_RELEASE();
+
+        // 32-bit aligned stores are atomic, and the writer controls the write index
+        ++writeIndex;
+        m_WriteIndex = writeIndex;
+        return true;
+    }
+
+}
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp
@ -0,0 +1,437 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#include <assert.h>
+
+#include "TaskScheduler.h"
+#include "LockLessMultiReadPipe.h"
+
+
+
+using namespace enki;
+
+
+static const uint32_t PIPESIZE_LOG2              = 8;
+static const uint32_t SPIN_COUNT                 = 100;
+static const uint32_t SPIN_BACKOFF_MULTIPLIER    = 10;
+static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;
+
+// each software thread gets its own copy of gtl_threadNum, so this is safe to use as a static variable
+static THREAD_LOCAL uint32_t                             gtl_threadNum       = 0;
+
+namespace enki 
+{
+	// One queue entry: a task set plus the sub-range of it to be run or split further.
+	struct SubTaskSet
+	{
+		ITaskSet*           pTask;      // task set this range belongs to
+		TaskSetPartition    partition;  // half-open range [start, end) of pTask covered by this entry
+	};
+
+	// we derive class TaskPipe rather than typedef to get forward declaration working easily
+	// (pipe of SubTaskSet entries, sized by PIPESIZE_LOG2 - presumably 2^PIPESIZE_LOG2 slots; confirm in LockLessMultiReadPipe.h)
+	class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};
+
+	// Start-up arguments for one tasking thread; TaskingThreadFunction copies them on entry.
+	struct ThreadArgs
+	{
+		uint32_t		threadNum;       // index of the thread, also used to select its pipe
+		TaskScheduler*  pTaskScheduler;  // scheduler the thread serves
+	};
+}
+
+namespace
+{
+	// Splits the leading part off subTask_.
+	// Returns a SubTaskSet covering [start, start + min(rangeToSplit_, remaining)) and advances
+	// subTask_.partition.start to the end of the returned range, so subTask_ keeps the remainder.
+	SubTaskSet       SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )
+	{
+		SubTaskSet splitTask = subTask_;
+		uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;
+
+		// never hand out more than is left in the partition
+		if( rangeToSplit_ > rangeLeft )
+		{
+			rangeToSplit_ = rangeLeft;
+		}
+		splitTask.partition.end = subTask_.partition.start + rangeToSplit_;
+		subTask_.partition.start = splitTask.partition.end;
+		return splitTask;
+	}
+
+	// Pause(): hint issued inside the spin back-off loop of the tasking threads.
+	// x86/x64 use the pause instruction; every other target gets a no-op so that
+	// Pause() is defined on all platforms (previously it was missing on non-x86 Windows).
+	#if defined _WIN32
+		#if defined _M_IX86  || defined _M_X64
+			#pragma intrinsic(_mm_pause)
+			inline void Pause() { _mm_pause(); }
+		#else
+			inline void Pause() { ;} // may have NOP or yield equiv
+		#endif
+	#elif defined __i386__ || defined __x86_64__
+		inline void Pause() { __asm__ __volatile__("pause;"); }
+	#else
+		inline void Pause() { ;} // may have NOP or yield equiv
+	#endif
+}
+
+
+// Invokes func_( threadnum_ ) when a callback is set; a null callback is silently skipped.
+static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)
+{
+	if( !func_ )
+	{
+		return;
+	}
+	func_(threadnum_);
+}
+
+// Returns a pointer to the scheduler's own callback table so the caller can fill it in.
+ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()
+{
+	ProfilerCallbacks* pCallbacks = &m_ProfilerCallbacks;
+	return pCallbacks;
+}
+
+// Entry point of every worker thread created by StartThreads.
+// pArgs points at a ThreadArgs holding the thread index and the owning scheduler.
+// The thread runs tasks while m_bRunning is set; when none are found it spins with a growing
+// back-off and, after more than SPIN_COUNT consecutive misses, blocks in WaitForTasks.
+// On exit it decrements m_NumThreadsRunning and fires the threadStop callback. Returns 0.
+THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )
+{
+	// take a private copy of the arguments
+	ThreadArgs args					= *(ThreadArgs*)pArgs;
+	uint32_t threadNum				= args.threadNum;
+	TaskScheduler*  pTS				= args.pTaskScheduler;
+    // publish the index in thread-local storage for code that has no threadNum parameter
+    gtl_threadNum      = threadNum;
+
+	SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );
+    
+    uint32_t spinCount = 0;
+	uint32_t hintPipeToCheck_io = threadNum + 1;	// does not need to be clamped.
+    while( pTS->m_bRunning )
+    {
+        if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )
+        {
+            // no tasks, will spin then wait
+            ++spinCount;
+            if( spinCount > SPIN_COUNT )
+            {
+				pTS->WaitForTasks( threadNum );
+				spinCount = 0;
+            }
+			else
+			{
+				// back-off grows linearly with the number of consecutive misses
+				uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;
+				while( spinBackoffCount )
+				{
+					Pause();
+					--spinBackoffCount;
+				}
+			}
+        }
+        else
+        {
+            // found work, so restart the spin sequence
+            spinCount = 0;
+        }
+    }
+
+    // StopThreads polls m_NumThreadsRunning to detect that this thread has left its loop
+    AtomicAdd( &pTS->m_NumThreadsRunning, -1 );
+	SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );
+
+    return 0;
+}
+
+
+// Creates the semaphore, the per-thread bookkeeping arrays and m_NumThreads-1 worker threads,
+// then derives the partition counts used when splitting task sets.
+// Does nothing if threads already exist (m_bHaveThreads).
+// NOTE(review): the result of ThreadCreate is not checked; m_NumThreadsRunning is incremented regardless.
+void TaskScheduler::StartThreads()
+{
+    if( m_bHaveThreads )
+    {
+        return;
+    }
+    m_bRunning = true;
+
+    SemaphoreCreate( m_NewTaskSemaphore );
+
+    // we create one less thread than m_NumThreads as the main thread counts as one
+    m_pThreadNumStore = new ThreadArgs[m_NumThreads];
+    m_pThreadIDs      = new threadid_t[m_NumThreads];
+	// slot 0 describes the calling thread; no thread is created for it
+	m_pThreadNumStore[0].threadNum      = 0;
+	m_pThreadNumStore[0].pTaskScheduler = this;
+	m_pThreadIDs[0] = 0;
+    m_NumThreadsWaiting = 0;
+    m_NumThreadsRunning = 1;// account for main thread
+    for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
+    {
+		m_pThreadNumStore[thread].threadNum      = thread;
+		m_pThreadNumStore[thread].pTaskScheduler = this;
+        ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );
+        ++m_NumThreadsRunning;
+    }
+
+    // ensure we have sufficient tasks to equally fill either all threads including main
+    // or just the threads we've launched, this is outside the firstinit as we want to be able
+    // to runtime change it
+	if( 1 == m_NumThreads )
+	{
+		m_NumPartitions = 1;
+		m_NumInitialPartitions = 1;
+	}
+	else
+	{
+		m_NumPartitions = m_NumThreads * (m_NumThreads - 1);
+		m_NumInitialPartitions = m_NumThreads - 1;
+		// cap the number of initial partitions pushed by AddTaskSetToPipe
+		if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )
+		{
+			m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;
+		}
+	}
+
+    m_bHaveThreads = true;
+}
+
+// Stops the worker threads and releases everything StartThreads created.
+// bWait_ - when true, loop until every worker has decremented m_NumThreadsRunning
+//          (i.e. only the main thread is still counted) before tearing down.
+// Does nothing if no threads exist. Resets m_NumThreads to 0, so Initialize must be called
+// again before the scheduler is reused.
+void TaskScheduler::StopThreads( bool bWait_ )
+{
+    if( m_bHaveThreads )
+    {
+        // wait for the threads to quit before deleting data
+        m_bRunning = false;
+        while( bWait_ && m_NumThreadsRunning > 1 )
+        {
+            // keep firing event to ensure all threads pick up state of m_bRunning
+            SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );
+        }
+
+        // NOTE(review): ThreadTerminate is called for every worker even after waiting, and no
+        // join is visible here - confirm thread resources are reclaimed on POSIX. With
+        // bWait_ == false the arrays below are deleted while workers may still be running.
+        for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
+        {
+            ThreadTerminate( m_pThreadIDs[thread] );
+        }
+
+		m_NumThreads = 0;
+        delete[] m_pThreadNumStore;
+        delete[] m_pThreadIDs;
+        m_pThreadNumStore = 0;
+        m_pThreadIDs = 0;
+        SemaphoreClose( m_NewTaskSemaphore );
+
+        m_bHaveThreads = false;
+		m_NumThreadsWaiting = 0;
+		m_NumThreadsRunning = 0;
+    }
+}
+
+// Tries to fetch and execute one sub-task.
+// threadNum            - index of the calling thread; selects its own pipe.
+// hintPipeToCheck_io_  - in: pipe index at which to start looking in other threads' pipes;
+//                        out: set to the last pipe index examined when a task was obtained.
+// Returns true if a task was run, false if no pipe yielded one.
+bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )
+{
+    // check for tasks
+    // own pipe first, read from the writer side
+    SubTaskSet subTask;
+    bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );
+
+	// otherwise visit every other pipe once, starting at the hint, reading from the reader side
+	uint32_t threadToCheck = hintPipeToCheck_io_;
+	uint32_t checkCount = 0;
+    while( !bHaveTask && checkCount < m_NumThreads )
+    {
+		threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;
+		if( threadToCheck != threadNum )
+		{
+			bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );
+		}
+		++checkCount;
+    }
+        
+    if( bHaveTask )
+    {
+		// update hint, will preserve value unless actually got task from another thread.
+		hintPipeToCheck_io_ = threadToCheck;
+
+		uint32_t partitionSize = subTask.partition.end - subTask.partition.start;
+		if( subTask.pTask->m_RangeToRun < partitionSize )
+		{
+			// partition is larger than the run granularity: keep the first m_RangeToRun items
+			// for this thread and queue the rest in m_RangeToRun-sized pieces before running
+			SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );
+			SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );
+			taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );
+			// this entry is finished; the queued pieces were counted by SplitAndAddTask
+			AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );
+		}
+		else
+		{
+
+			// the task has already been divided up by AddTaskSetToPipe, so just run it
+			subTask.pTask->ExecuteRange( subTask.partition, threadNum );
+			AtomicAdd( &subTask.pTask->m_RunningCount, -1 );
+		}
+    }
+
+    return bHaveTask;
+
+}
+
+// Blocks the calling worker on m_NewTaskSemaphore unless some pipe is non-empty.
+// threadNum is only passed to the waitStart/waitStop profiler callbacks.
+void TaskScheduler::WaitForTasks( uint32_t threadNum )
+{
+	// We increment the number of threads waiting here in order
+	// to ensure that the check for tasks occurs after the increment
+	// to prevent a task being added after a check, then the thread waiting.
+	// This will occasionally result in threads being mistakenly awoken,
+	// but they will then go back to sleep.
+	AtomicAdd( &m_NumThreadsWaiting, 1 );
+
+    // scan all pipes, including this thread's own
+    bool bHaveTasks = false;
+    for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
+    {
+        if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
+        {
+            bHaveTasks = true;
+            break;
+        }
+    }
+    if( !bHaveTasks )
+    {
+        SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );
+        SemaphoreWait( m_NewTaskSemaphore );
+        SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );
+    }
+
+    // undo the increment from the top; prev is only inspected by the assert
+    int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );
+    assert( prev != 0 );
+}
+
+// Signals the new-task semaphore once for every thread currently counted as waiting.
+void TaskScheduler::WakeThreads()
+{
+	int32_t numWaiting = m_NumThreadsWaiting;
+	SemaphoreSignal( m_NewTaskSemaphore, numWaiting );
+}
+
+// Cuts subTask_ into pieces of at most rangeToSplit_ items and pushes them onto the calling
+// thread's pipe. When the pipe is full the piece is executed immediately on this thread
+// instead (limited to m_RangeToRun items, the rest stays in subTask_ for the next iteration).
+// threadNum_           - thread index passed to ExecuteRange for pieces run inline.
+// subTask_             - by-value copy of the range to distribute; consumed by the loop.
+// rangeToSplit_        - maximum size of each queued piece.
+// runningCountOffset_  - extra amount added to the task's m_RunningCount together with the
+//                        number of pieces queued.
+// Finishes by waking any waiting threads.
+void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
+	uint32_t rangeToSplit_, int32_t runningCountOffset_ )
+{
+    int32_t numAdded = 0;
+    while( subTask_.partition.start != subTask_.partition.end )
+    {
+        SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );
+
+        // add the partition to the pipe
+        ++numAdded;
+        if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )
+        {
+			// pipe is full: let waiting threads drain what has been queued so far
+			if( numAdded > 1 )
+			{
+				WakeThreads();
+			}
+			// alter range to run the appropriate fraction.
+			// Compare against the size actually split off, not rangeToSplit_: the last piece
+			// can be shorter than rangeToSplit_, and growing it to m_RangeToRun would run past
+			// the end of the set and push start beyond end, so the loop would not terminate.
+			uint32_t sizeSplitOff = taskToAdd.partition.end - taskToAdd.partition.start;
+			if( taskToAdd.pTask->m_RangeToRun < sizeSplitOff )
+			{
+				taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;
+				subTask_.partition.start = taskToAdd.partition.end;
+			}
+            taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );
+            // this piece was run inline, so it does not count as queued
+            --numAdded;
+        }
+    }
+
+    // increment running count by number added
+    AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );
+
+	WakeThreads();
+}
+
+// Queues a whole task set: computes the run granularity (m_RangeToRun) and the size of the
+// initial pieces, then hands the full range [0, m_SetSize) to SplitAndAddTask.
+void    TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )
+{
+	// -1 keeps the set from looking complete until every piece has been added;
+	// the offset of 1 passed to SplitAndAddTask cancels it again
+    pTaskSet->m_RunningCount = -1;
+
+    const uint32_t minRange = pTaskSet->m_MinRange;
+
+    // granularity at which partitions are executed, never below the minimum range
+    uint32_t rangeToRun = pTaskSet->m_SetSize / m_NumPartitions;
+    if( rangeToRun < minRange )
+    {
+        rangeToRun = minRange;
+    }
+    pTaskSet->m_RangeToRun = rangeToRun;
+
+    // size of the pieces initially pushed to the pipe, never below the minimum range
+    uint32_t initialRange = pTaskSet->m_SetSize / m_NumInitialPartitions;
+    if( initialRange < minRange )
+    {
+        initialRange = minRange;
+    }
+
+    SubTaskSet wholeSet;
+    wholeSet.pTask           = pTaskSet;
+    wholeSet.partition.start = 0;
+    wholeSet.partition.end   = pTaskSet->m_SetSize;
+    SplitAndAddTask( gtl_threadNum, wholeSet, initialRange, 1 );
+}
+
+// With a task set: keeps running queued tasks on the calling thread until the set's running
+// count reaches zero. With a null pointer: makes a single attempt to run one task and returns.
+void    TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )
+{
+	uint32_t pipeHint = gtl_threadNum + 1;	// does not need to be clamped.
+
+	if( !pTaskSet )
+	{
+		TryRunTask( gtl_threadNum, pipeHint );
+		return;
+	}
+
+	// should add a spin then wait for task completion event.
+	while( pTaskSet->m_RunningCount )
+	{
+		TryRunTask( gtl_threadNum, pipeHint );
+	}
+}
+
+// Runs tasks on the calling thread until every pipe is empty and the number of waiting
+// threads has reached the number of running threads other than the caller.
+void    TaskScheduler::WaitforAll()
+{
+    bool bHaveTasks = true;
+ 	uint32_t hintPipeToCheck_io = gtl_threadNum  + 1;	// does not need to be clamped.
+	// sampled once: every running thread except this one
+	int32_t threadsRunning = m_NumThreadsRunning - 1;
+    while( bHaveTasks || m_NumThreadsWaiting < threadsRunning )
+    {
+        TryRunTask( gtl_threadNum, hintPipeToCheck_io );
+        // re-check all pipes after each attempt
+        bHaveTasks = false;
+        for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
+        {
+            if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
+            {
+                bHaveTasks = true;
+                break;
+            }
+        }
+     }
+}
+
+// Drains all work, stops the worker threads (waiting for them) and frees the pipes.
+void    TaskScheduler::WaitforAllAndShutdown()
+{
+    WaitforAll();
+    StopThreads( true );
+
+    delete[] m_pPipesPerThread;
+    m_pPipesPerThread = NULL;
+}
+
+// Returns m_NumThreads, the thread count given to Initialize (main thread included).
+uint32_t        TaskScheduler::GetNumTaskThreads() const
+{
+    const uint32_t numThreads = m_NumThreads;
+    return numThreads;
+}
+
+// Constructs an idle scheduler: no pipes, no threads, all counters zero and all profiler
+// callbacks cleared. Initialize() must be called before tasks are added.
+TaskScheduler::TaskScheduler()
+		: m_pPipesPerThread(NULL)
+		, m_NumThreads(0)
+		, m_pThreadNumStore(NULL)
+		, m_pThreadIDs(NULL)
+		, m_bRunning(false)
+		, m_NumThreadsRunning(0)
+		, m_NumThreadsWaiting(0)
+		, m_NumPartitions(0)
+		, m_NumInitialPartitions(0)	// was left uninitialised until StartThreads ran
+		, m_bHaveThreads(false)
+{
+	memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));
+}
+
+// Stops the worker threads, waiting for them to leave their loops, then frees the pipes.
+TaskScheduler::~TaskScheduler()
+{
+    const bool bWaitForThreads = true;
+    StopThreads( bWaitForThreads );
+
+    delete[] m_pPipesPerThread;
+    m_pPipesPerThread = NULL;
+}
+
+// (Re)initialises the scheduler for numThreads_ threads (must be non-zero): tears down any
+// existing threads and pipes, allocates one pipe per thread and starts the workers.
+void    TaskScheduler::Initialize( uint32_t numThreads_ )
+{
+	assert( numThreads_ );
+
+    // tear down whatever a previous Initialize created; waits for the threads
+    StopThreads( true );
+    delete[] m_pPipesPerThread;
+
+    m_NumThreads      = numThreads_;
+    m_pPipesPerThread = new TaskPipe[ numThreads_ ];
+
+    StartThreads();
+}
+
+// Initialises the scheduler with the thread count reported by GetNumHardwareThreads().
+void   TaskScheduler::Initialize()
+{
+	const uint32_t numHardwareThreads = GetNumHardwareThreads();
+	Initialize( numHardwareThreads );
+}
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h
@ -0,0 +1,177 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#pragma once
+
+#include <stdint.h>
+#include "Threads.h"
+
+namespace enki
+{
+
+	// Range of task-set items handed to ITaskSet::ExecuteRange.
+	struct TaskSetPartition
+	{
+		uint32_t start; // first item of the range
+		uint32_t end;   // one past the last item (the scheduler computes sizes as end - start)
+	};
+
+	class  TaskScheduler;
+	class  TaskPipe;
+	struct ThreadArgs;
+	struct SubTaskSet;
+
+	// Subclass ITaskSet to create tasks.
+	// TaskSets can be re-used, but check completion (GetIsComplete) first.
+	class ITaskSet
+	{
+	public:
+        // Single-item set with a minimum range of 1.
+        ITaskSet()
+            : m_SetSize(1)
+			, m_MinRange(1)
+            , m_RunningCount(0)
+			, m_RangeToRun(1)
+        {}
+
+        // Set of setSize_ items with a minimum range of 1.
+        ITaskSet( uint32_t setSize_ )
+            : m_SetSize( setSize_ )
+			, m_MinRange(1)
+            , m_RunningCount(0)
+			, m_RangeToRun(1)
+        {}
+
+		// Set of setSize_ items which is split into ranges of at least minRange_ items.
+		ITaskSet( uint32_t setSize_, uint32_t minRange_ )
+            : m_SetSize( setSize_ )
+			, m_MinRange( minRange_ )
+            , m_RunningCount(0)
+			, m_RangeToRun(minRange_)
+        {}
+
+		// Virtual so that a task set can safely be destroyed through an ITaskSet pointer.
+		virtual ~ITaskSet() {}
+
+		// Execute range should be overloaded to process tasks. It will be called with a
+		// range_ where range.start >= 0; range.start < range.end; and range.end <= m_SetSize;
+		// The range values should be mapped so that linearly processing them in order is cache friendly
+		// i.e. neighbouring values should be close together.
+		// threadnum should not be used for changing processing of data, its intended purpose
+		// is to allow per-thread data buckets for output.
+		virtual void            ExecuteRange( TaskSetPartition range, uint32_t threadnum  ) = 0;
+
+		// Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1
+		uint32_t                m_SetSize;
+
+		// Minimum size of TaskSetPartition range when splitting a task set into partitions.
+		// This should be set to a value which results in computation effort of at least 10k
+		// clock cycles to minimize task scheduler overhead.
+		// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
+		// of m_MinRange.
+		// Also known as grain size in literature.
+		uint32_t                m_MinRange;
+
+		// True when no partition of this set is queued or running (running count is zero).
+		bool                    GetIsComplete() const
+		{
+			return 0 == m_RunningCount;
+		}
+	private:
+		friend class           TaskScheduler;
+		volatile int32_t        m_RunningCount; // outstanding partitions, maintained by TaskScheduler
+		uint32_t                m_RangeToRun;   // partition size at which the scheduler executes ranges
+	};
+
+	// TaskScheduler implements several callbacks intended for profilers
+	// Signature of a profiler callback; the argument is the index of the thread reporting the event.
+	typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ );
+	// Optional hooks; a null entry is skipped by the scheduler.
+	struct ProfilerCallbacks
+	{
+		ProfilerCallbackFunc threadStart; // called when a tasking thread starts
+		ProfilerCallbackFunc threadStop;  // called when a tasking thread leaves its run loop
+		ProfilerCallbackFunc waitStart;   // called before a thread blocks waiting for new tasks
+		ProfilerCallbackFunc waitStop;    // called after that wait returns
+	};
+
+	// Task scheduler: keeps one task pipe per thread and a set of worker threads which run
+	// partitions of ITaskSets. Not copyable.
+	class TaskScheduler
+	{
+	public:
+		TaskScheduler();
+		~TaskScheduler();
+
+		// Call either Initialize() or Initialize( numThreads_ ) before adding tasks.
+
+		// Initialize() will create GetNumHardwareThreads()-1 threads, which is
+		// sufficient to fill the system when including the main thread.
+		// Initialize can be called multiple times - it will wait for completion
+		// before re-initializing.
+		void			Initialize();
+
+		// Initialize( numThreads_ ) - numThreads_ (must be > 0)
+		// will create numThreads_-1 threads, as thread 0 is
+		// the thread on which the initialize was called.
+		void			Initialize( uint32_t numThreads_ );
+
+
+		// Adds the TaskSet to pipe and returns if the pipe is not full.
+		// If the pipe is full, pTaskSet is run.
+		// should only be called from main thread, or within a task
+		void            AddTaskSetToPipe( ITaskSet* pTaskSet );
+
+		// Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete();
+		// should only be called from thread which created the taskscheduler , or within a task
+		// if called with 0 it will try to run tasks, and return if none available.
+		void            WaitforTaskSet( const ITaskSet* pTaskSet );
+
+		// Waits for all task sets to complete - not guaranteed to work unless we know we
+		// are in a situation where tasks aren't being continuously added.
+		void            WaitforAll();
+
+		// Waits for all task sets to complete and shutdown threads - not guaranteed to work unless we know we
+		// are in a situation where tasks aren't being continuously added.
+		void            WaitforAllAndShutdown();
+
+		// Returns the number of threads created for running tasks + 1
+		// to account for the main thread.
+		uint32_t        GetNumTaskThreads() const;
+
+		// Returns the ProfilerCallbacks structure so that it can be modified to
+		// set the callbacks.
+		ProfilerCallbacks* GetProfilerCallbacks();
+
+	private:
+		static THREADFUNC_DECL  TaskingThreadFunction( void* pArgs );
+        void             WaitForTasks( uint32_t threadNum );
+		bool             TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ );
+		void             StartThreads();
+		void             StopThreads( bool bWait_ );
+		void             SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
+										  uint32_t rangeToSplit_, int32_t runningCountOffset_ );
+		void             WakeThreads();
+
+		// array of m_NumThreads pipes, indexed by thread number
+		TaskPipe*                                                m_pPipesPerThread;
+
+		uint32_t                                                 m_NumThreads;
+		ThreadArgs*                                              m_pThreadNumStore;
+		threadid_t*                                              m_pThreadIDs;
+		// cleared by StopThreads to make the tasking threads leave their loops
+		volatile bool                                            m_bRunning;
+		volatile int32_t                                         m_NumThreadsRunning;
+		volatile int32_t                                         m_NumThreadsWaiting;
+		// divisors applied to a set's size to obtain run granularity / initial piece size
+		uint32_t                                                 m_NumPartitions;
+		uint32_t                                                 m_NumInitialPartitions;
+		semaphoreid_t                                            m_NewTaskSemaphore;
+		bool                                                     m_bHaveThreads;
+		ProfilerCallbacks										 m_ProfilerCallbacks;
+
+		// declared but not defined here: copying is disabled
+		TaskScheduler( const TaskScheduler& nocopy );
+		TaskScheduler& operator=( const TaskScheduler& nocopy );
+	};
+
+}
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp
@ -0,0 +1,122 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#include "TaskScheduler_c.h"
+#include "TaskScheduler.h"
+
+#include <assert.h>
+
+using namespace enki;
+
+// Concrete type behind the opaque C handle; adds nothing to enki::TaskScheduler.
+struct enkiTaskScheduler : TaskScheduler
+{
+};
+
+// Concrete type behind the opaque C task-set handle: an ITaskSet which forwards
+// ExecuteRange to a C function pointer together with a user argument.
+struct enkiTaskSet : ITaskSet
+{
+	// pArgs starts as NULL; the enkiAddTaskSetToPipe* functions assign it before scheduling
+	enkiTaskSet( enkiTaskExecuteRange taskFun_ ) : taskFun(taskFun_), pArgs(NULL) {}
+
+	// Unpacks the partition and calls taskFun( start, end, threadnum, pArgs ).
+	virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum  )
+	{
+		taskFun( range.start, range.end, threadnum, pArgs );
+	}
+
+	enkiTaskExecuteRange taskFun; // C callback executed for each range
+	void* pArgs;                  // opaque user pointer passed through to taskFun
+};
+
+// Heap-allocates a scheduler; release it with enkiDeleteTaskScheduler.
+enkiTaskScheduler*	enkiNewTaskScheduler()
+{
+	return new enkiTaskScheduler();
+}
+
+// Forwards to TaskScheduler::Initialize() (thread count chosen by the scheduler).
+void	            enkiInitTaskScheduler(  enkiTaskScheduler* pETS_ )
+{
+    enkiTaskScheduler& scheduler = *pETS_;
+    scheduler.Initialize();
+}
+
+// Forwards to TaskScheduler::Initialize( numThreads_ ).
+void	            enkiInitTaskSchedulerNumThreads(  enkiTaskScheduler* pETS_, uint32_t numThreads_ )
+{
+    enkiTaskScheduler& scheduler = *pETS_;
+    scheduler.Initialize( numThreads_ );
+}
+
+// Destroys a scheduler created by enkiNewTaskScheduler; a null pointer is a no-op for delete.
+void				enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ )
+{
+	enkiTaskScheduler* pScheduler = pETS_;
+	delete pScheduler;
+}
+
+// Heap-allocates a task set bound to taskFunc_. pETS_ is not used.
+enkiTaskSet*		enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_  )
+{
+	enkiTaskSet* pNewTaskSet = new enkiTaskSet( taskFunc_ );
+	return pNewTaskSet;
+}
+
+// Destroys a task set created by enkiCreateTaskSet.
+void                enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ )
+{
+	enkiTaskSet* pDoomed = pTaskSet_;
+	delete pDoomed;
+}
+
+// Stores the user argument and set size on the task set, then schedules it.
+// The task set and its callback must be non-null (asserted).
+void				enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, void* pArgs_, uint32_t setSize_ )
+{
+	assert( pTaskSet_ );
+	assert( pTaskSet_->taskFun );
+
+	enkiTaskSet& taskSet = *pTaskSet_;
+	taskSet.pArgs     = pArgs_;
+	taskSet.m_SetSize = setSize_;
+
+	pETS_->AddTaskSetToPipe( &taskSet );
+}
+
+// As enkiAddTaskSetToPipe, but also sets the minimum partition range before scheduling.
+void enkiAddTaskSetToPipeMinRange(enkiTaskScheduler * pETS_, enkiTaskSet * pTaskSet_, void * pArgs_, uint32_t setSize_, uint32_t minRange_)
+{
+	assert( pTaskSet_ );
+	assert( pTaskSet_->taskFun );
+
+	enkiTaskSet& taskSet = *pTaskSet_;
+	taskSet.pArgs      = pArgs_;
+	taskSet.m_SetSize  = setSize_;
+	taskSet.m_MinRange = minRange_;
+
+	pETS_->AddTaskSetToPipe( &taskSet );
+}
+
+// Non-blocking completion query: 1 if the task set reports complete, 0 otherwise. pETS_ is not used.
+int				enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
+{
+	assert( pTaskSet_ );
+	if( pTaskSet_->GetIsComplete() )
+	{
+		return 1;
+	}
+	return 0;
+}
+
+// Forwards to TaskScheduler::WaitforTaskSet.
+void				enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ )
+{
+	enkiTaskScheduler& scheduler = *pETS_;
+	scheduler.WaitforTaskSet( pTaskSet_ );
+}
+
+// Forwards to TaskScheduler::WaitforAll.
+void				enkiWaitForAll( enkiTaskScheduler* pETS_ )
+{
+	enkiTaskScheduler& scheduler = *pETS_;
+	scheduler.WaitforAll();
+}
+
+
+// Forwards to TaskScheduler::GetNumTaskThreads.
+uint32_t			enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ )
+{
+	const uint32_t numTaskThreads = pETS_->GetNumTaskThreads();
+	return numTaskThreads;
+}
+
+// Exposes the scheduler's C++ callback table through the C struct type; the assert checks
+// that the two structs have the same size.
+enkiProfilerCallbacks*	enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ )
+{
+    assert( sizeof(enkiProfilerCallbacks) == sizeof(enki::ProfilerCallbacks) );
+
+    enki::ProfilerCallbacks* pCallbacks = pETS_->GetProfilerCallbacks();
+    return (enkiProfilerCallbacks*)pCallbacks;
+}
+
--- a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h
+++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h
@ -0,0 +1,104 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+typedef struct enkiTaskScheduler enkiTaskScheduler;
+typedef struct enkiTaskSet		 enkiTaskSet;
+
+typedef void (* enkiTaskExecuteRange)( uint32_t start_, uint32_t end, uint32_t threadnum_, void* pArgs_ );
+
+
+// Create a new task scheduler
+enkiTaskScheduler*	enkiNewTaskScheduler();
+
+// Initialize task scheduler - will create GetNumHardwareThreads()-1 threads, which is
+// sufficient to fill the system when including the main thread.
+// Initialize can be called multiple times - it will wait for completion
+// before re-initializing.
+void	            enkiInitTaskScheduler(  enkiTaskScheduler* pETS_ );
+
+// Initialize a task scheduler with numThreads_ (must be > 0)
+// will create numThreads_-1 threads, as thread 0 is
+// the thread on which the initialize was called.
+void	            enkiInitTaskSchedulerNumThreads(  enkiTaskScheduler* pETS_, uint32_t numThreads_ );
+
+
+// Delete a task scheduler
+void				enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ );
+
+// Create a task set.
+enkiTaskSet*		enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_  );
+
+// Delete a task set.
+void                enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ );
+
+// Schedule the task
+void				enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
+										   void* pArgs_, uint32_t setSize_ );
+
+// Schedule the task with a minimum range.
+// This should be set to a value which results in computation effort of at least 10k
+// clock cycles to minimize task scheduler overhead.
+// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple
+// of m_MinRange.
+// Also known as grain size in literature.
+void				enkiAddTaskSetToPipeMinRange( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_,
+												  void* pArgs_, uint32_t setSize_, uint32_t minRange_ );
+
+
+// Check if TaskSet is complete. Doesn't wait. Returns 1 if complete, 0 if not.
+int					enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
+
+
+// Wait for a given task.
+// should only be called from thread which created the taskscheduler , or within a task
+// if called with 0 it will try to run tasks, and return if none available.
+void				enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ );
+
+
+// Waits for all task sets to complete - not guaranteed to work unless we know we
+// are in a situation where tasks aren't being continuously added.
+void				enkiWaitForAll( enkiTaskScheduler* pETS_ );
+
+
+// get number of threads
+uint32_t			enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ );
+
+// TaskScheduler implements several callbacks intended for profilers
+typedef void (*enkiProfilerCallbackFunc)( uint32_t threadnum_ );
+// C view of the scheduler's profiler callback table; each entry may be left unset.
+struct enkiProfilerCallbacks
+{
+    enkiProfilerCallbackFunc threadStart; // a tasking thread has started
+    enkiProfilerCallbackFunc threadStop;  // a tasking thread is stopping
+    enkiProfilerCallbackFunc waitStart;   // a thread is about to wait for new tasks
+    enkiProfilerCallbackFunc waitStop;    // a thread has finished waiting
+};
+
+// Get the callback structure so it can be set 
+struct enkiProfilerCallbacks*	enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ );
+
+#ifdef __cplusplus
+}
+#endif
--- a/examples/ToyPathTracer/Source/enkiTS/Threads.h
+++ b/examples/ToyPathTracer/Source/enkiTS/Threads.h
@ -0,0 +1,210 @@
+// Copyright (c) 2013 Doug Binks
+// 
+// This software is provided 'as-is', without any express or implied
+// warranty. In no event will the authors be held liable for any damages
+// arising from the use of this software.
+// 
+// Permission is granted to anyone to use this software for any purpose,
+// including commercial applications, and to alter it and redistribute it
+// freely, subject to the following restrictions:
+// 
+// 1. The origin of this software must not be misrepresented; you must not
+//    claim that you wrote the original software. If you use this software
+//    in a product, an acknowledgement in the product documentation would be
+//    appreciated but is not required.
+// 2. Altered source versions must be plainly marked as such, and must not be
+//    misrepresented as being the original software.
+// 3. This notice may not be removed or altered from any source distribution.
+
+#pragma once
+
+#include <stdint.h>
+#include <assert.h>
+
+#ifdef _WIN32
+
+	#include "Atomics.h"
+
+	#define WIN32_LEAN_AND_MEAN
+	#include <Windows.h>
+	
+	#define THREADFUNC_DECL DWORD WINAPI
+	#define THREAD_LOCAL __declspec( thread )
+
+namespace enki
+{
+    typedef HANDLE threadid_t;
+
+    // declare the thread start function as:
+    // THREADFUNC_DECL MyThreadStart( void* pArg );
+    // Creates a thread running StartFunc( pArg ) and stores its handle in *returnid.
+    // Returns true on success.
+    inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg )
+    {
+        // posix equiv pthread_create
+        DWORD threadid;
+        *returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid );
+        return  *returnid != NULL;
+    }
+
+    // Releases the thread handle. Note this only closes the handle, it does not stop the thread.
+    // Returns true on success, matching ThreadCreate and the POSIX implementation
+    // (CloseHandle returns non-zero on success; the previous `== 0` test was inverted).
+    inline bool ThreadTerminate( threadid_t threadid )
+    {
+        // posix equiv pthread_cancel
+        return CloseHandle( threadid ) != 0;
+    }
+
+    // Number of logical processors reported by GetSystemInfo.
+    inline uint32_t GetNumHardwareThreads()
+    {
+        SYSTEM_INFO sysInfo;
+        GetSystemInfo(&sysInfo);
+        return sysInfo.dwNumberOfProcessors;
+    }
+}
+
+#else // posix
+
+	#include <pthread.h>
+	#include <unistd.h>
+	#define THREADFUNC_DECL void*
+	#define THREAD_LOCAL __thread
+
+namespace enki
+{
+    typedef pthread_t threadid_t;
+
+    // declare the thread start function as:
+    // THREADFUNC_DECL MyThreadStart( void* pArg );
+    // Creates a thread with default attributes running StartFunc( pArg ) and stores its id
+    // in *returnid. Returns true on success.
+    inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg )
+    {
+        // posix equiv pthread_create
+        int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg );
+
+        return  retval == 0;
+    }
+
+    // Requests cancellation of the thread. Returns true if the request was accepted.
+    // NOTE(review): the thread is neither joined nor detached here - confirm who reclaims it.
+    inline bool ThreadTerminate( threadid_t threadid )
+    {
+        // posix equiv pthread_cancel
+        return pthread_cancel( threadid ) == 0;
+    }
+
+    // Number of processors currently online, never less than 1.
+    // sysconf returns -1 on error; casting that straight to uint32_t would yield a huge
+    // thread count, so fall back to a single thread instead.
+    inline uint32_t GetNumHardwareThreads()
+    {
+        long numOnline = sysconf( _SC_NPROCESSORS_ONLN );
+        if( numOnline < 1 )
+        {
+            return 1;
+        }
+        return (uint32_t)numOnline;
+    }
+}
+
+#endif // posix
+
+
+// Semaphore implementation
+#ifdef _WIN32
+
+namespace enki
+{
+    // Wrapper around a Win32 semaphore handle.
+    struct semaphoreid_t
+    {
+        HANDLE      sem;
+    };
+
+    // Creates an unnamed semaphore with initial count 0 and maximum count MAXLONG.
+    inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
+    {
+        HANDLE hSemaphore = CreateSemaphore(NULL, 0, MAXLONG, NULL );
+        semaphoreid.sem = hSemaphore;
+    }
+
+    // Closes the semaphore handle.
+    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
+    {
+        CloseHandle( semaphoreid.sem );
+    }
+
+    // Blocks without timeout until the semaphore is signalled.
+    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
+    {
+        DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE );
+
+        assert( retval != WAIT_FAILED );
+    }
+
+    // Releases the semaphore countWaiting times in one call; a count of zero does nothing.
+    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
+    {
+        if( 0 == countWaiting )
+        {
+            return;
+        }
+        ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL );
+    }
+}
+#elif defined(__MACH__)
+
+// OS X does not have POSIX semaphores
+// see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html
+#include <mach/mach.h>
+
+namespace enki
+{
+    // Wrapper around a Mach semaphore.
+    struct semaphoreid_t
+    {
+        semaphore_t   sem;
+    };
+
+    // Creates a FIFO-policy semaphore with initial value 0 in the current task.
+    inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
+    {
+        semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 );
+    }
+
+    // Destroys the semaphore.
+    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
+    {
+        semaphore_destroy( mach_task_self(), semaphoreid.sem );
+    }
+
+    // Blocks until the semaphore is signalled.
+    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
+    {
+        semaphore_wait( semaphoreid.sem );
+    }
+
+    // Signals the semaphore once per waiting thread; counts <= 0 do nothing.
+    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
+    {
+        for( int32_t remaining = countWaiting; remaining > 0; --remaining )
+        {
+            semaphore_signal( semaphoreid.sem );
+        }
+    }
+}
+
+#else // POSIX
+
+#include <semaphore.h>
+
+namespace enki
+{
+    // Wrapper around a POSIX unnamed semaphore.
+    struct semaphoreid_t
+    {
+        sem_t   sem;
+    };
+
+    // Initialises a semaphore with value 0 that is not shared between processes.
+    inline void SemaphoreCreate( semaphoreid_t& semaphoreid )
+    {
+        int result = sem_init( &semaphoreid.sem, 0, 0 );
+        assert( result == 0 );
+    }
+
+    // Destroys the semaphore.
+    inline void SemaphoreClose( semaphoreid_t& semaphoreid )
+    {
+        sem_destroy( &semaphoreid.sem );
+    }
+
+    // Blocks until the semaphore is posted; asserts that sem_wait reported success.
+    inline void SemaphoreWait( semaphoreid_t& semaphoreid  )
+    {
+        int result = sem_wait( &semaphoreid.sem );
+        assert( result == 0 );
+    }
+
+    // Posts the semaphore once per waiting thread; counts <= 0 do nothing.
+    inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting )
+    {
+        for( int32_t remaining = countWaiting; remaining > 0; --remaining )
+        {
+            sem_post( &semaphoreid.sem );
+        }
+    }
+}
+#endif
+
+
--- a/examples/ToyPathTracer/Windows/ComputeShader.hlsl
+++ b/examples/ToyPathTracer/Windows/ComputeShader.hlsl
@ -0,0 +1,395 @@
+#include "../Source/Config.h"
+
+// Xorshift-style pseudo random generator: scrambles `state` in place with three
+// shift/xor steps (<<13, >>17, <<15) and returns the updated state value.
+inline uint RNG(inout uint state)
+{
+    state ^= state << 13;
+    state ^= state >> 17;
+    state ^= state << 15;
+    return state;
+}
+
+// Returns a pseudo random float in [0, 1): the low 24 bits of the next RNG()
+// value divided by 2^24. Advances `state`.
+float RandomFloat01(inout uint state)
+{
+    uint low24 = RNG(state) & 0xFFFFFF;
+    return low24 / 16777216.0f;
+}
+
+// Returns a random point inside the unit disk in the XY plane (z is 0).
+// Draws two random numbers: first the angle, then the value whose square root
+// becomes the distance from the centre.
+float3 RandomInUnitDisk(inout uint state)
+{
+    float angle = RandomFloat01(state) * 2.0f * 3.1415926f;
+    float2 onCircle = float2(cos(angle), sin(angle));
+    float dist = sqrt(RandomFloat01(state));
+    return float3(onCircle * dist, 0);
+}
+// Returns a random point inside the unit sphere: a direction built from a
+// random height and angle, scaled by the cube root of a third random number.
+float3 RandomInUnitSphere(inout uint state)
+{
+    float height = RandomFloat01(state) * 2.0f - 1.0f;
+    float angle = RandomFloat01(state) * 2.0f * 3.1415926f;
+    // max() keeps the sqrt argument non-negative
+    float ring = sqrt(max(0.0, 1.0f - height * height));
+    float3 onSphere = float3(ring * cos(angle), ring * sin(angle), height);
+    onSphere *= pow(RandomFloat01(state), 1.0 / 3.0);
+    return onSphere;
+}
+// Returns a random direction on the unit sphere, built from a random height
+// (z) and a random angle around the z axis. Consumes two random numbers.
+float3 RandomUnitVector(inout uint state)
+{
+    float height = RandomFloat01(state) * 2.0f - 1.0f;
+    float angle = RandomFloat01(state) * 2.0f * 3.1415926f;
+    float ring = sqrt(1.0f - height * height);
+    return float3(ring * cos(angle), ring * sin(angle), height);
+}
+
+
+
+struct Ray // a ray described by a start point and a direction
+{
+    float3 orig; // start point of the ray
+    float3 dir; // direction; HitSpheres' quadratic has no dot(dir, dir) term, so it relies on unit length
+};
+// Builds a Ray from an origin and a direction; the direction is stored as given.
+Ray MakeRay(float3 orig_, float3 dir_)
+{
+    Ray result;
+    result.orig = orig_;
+    result.dir = dir_;
+    return result;
+}
+// Returns the point reached by travelling `t` times the ray direction from the ray origin.
+float3 RayPointAt(Ray r, float t)
+{
+    float3 travelled = r.dir * t;
+    return r.orig + travelled;
+}
+
+
+// Refracts direction v through a surface with normal n, where nint is the
+// ratio of refraction indices supplied by the caller.
+// Returns true and writes the refracted direction to outRefracted when the
+// discriminant is positive. Returns false otherwise (total internal
+// reflection); outRefracted is then set to zero so the out parameter is
+// defined on every path.
+inline bool refract(float3 v, float3 n, float nint, out float3 outRefracted)
+{
+    float dt = dot(v, n);
+    float discr = 1.0f - nint * nint*(1 - dt * dt);
+    if (discr > 0)
+    {
+        outRefracted = nint * (v - n * dt) - n * sqrt(discr);
+        return true;
+    }
+    outRefracted = float3(0, 0, 0);
+    return false;
+}
+// Schlick-style reflectance approximation: blends from r0 (derived from the
+// refraction index `ri`) towards 1 using (1 - cosine)^5.
+inline float schlick(float cosine, float ri)
+{
+    float ratio = (1 - ri) / (1 + ri);
+    float r0 = ratio * ratio;
+    // saturate protects pow() from tiny negative inputs
+    float falloff = pow(saturate(1 - cosine), 5);
+    return r0 + (1 - r0)*falloff;
+}
+
+struct Hit // result of a ray/sphere intersection, filled in by HitSpheres
+{
+    float3 pos; // hit position: RayPointAt(ray, t)
+    float3 normal; // (pos - sphere center) * sphere invRadius
+    float t; // ray parameter of the hit
+};
+
+struct Sphere // element type of the g_Spheres structured buffer
+{
+    float3 center; // sphere center
+    float radius; // sphere radius (squared inside the intersection test)
+    float invRadius; // multiplied into (pos - center) to form the hit normal
+};
+
+#define MatLambert 0
+#define MatMetal 1
+#define MatDielectric 2
+
+struct Material // element type of the g_Materials structured buffer
+{
+    int type; // compared against MatLambert / MatMetal / MatDielectric in Scatter
+    float3 albedo; // returned as the attenuation for Lambert and metal materials
+    float3 emissive; // emitted light, accumulated in Trace and used for light sampling
+    float roughness; // scales the random perturbation of metal reflections
+    float ri; // refraction index, used by the dielectric branch
+};
+
+groupshared Sphere s_GroupSpheres[kCSMaxObjects]; // filled from g_Spheres at the start of main()
+groupshared Material s_GroupMaterials[kCSMaxObjects]; // filled from g_Materials; indexed with the same id as s_GroupSpheres
+groupshared int s_GroupEmissives[kCSMaxObjects]; // filled from g_Emissives; each entry indexes the two arrays above
+
+
+struct Camera // camera description consumed by CameraGetRay
+{
+    float3 origin; // ray origin before the lens offset is added
+    float3 lowerLeftCorner; // target point for s = 0, t = 0
+    float3 horizontal; // added to the target scaled by s
+    float3 vertical; // added to the target scaled by t
+    float3 u, v, w; // u and v span the lens offset; w is not referenced in this shader
+    float lensRadius; // scales the random unit-disk sample used as lens offset
+};
+
+// Generates a camera ray for coordinates (s, t): the origin is the camera
+// origin shifted by a random lens offset, and the direction points at
+// lowerLeftCorner + s * horizontal + t * vertical.
+Ray CameraGetRay(Camera cam, float s, float t, inout uint state)
+{
+    float3 lensSample = cam.lensRadius * RandomInUnitDisk(state);
+    float3 lensOffset = cam.u * lensSample.x + cam.v * lensSample.y;
+    float3 target = cam.lowerLeftCorner + s * cam.horizontal + t * cam.vertical;
+    float3 direction = normalize(target - cam.origin - lensOffset);
+    return MakeRay(cam.origin + lensOffset, direction);
+}
+
+
+// Intersects ray r with the first sphereCount entries of s_GroupSpheres and
+// finds the nearest hit with tMin < t < tMax.
+// Returns the index of the sphere that was hit, or -1 when nothing was hit;
+// outHit is only written when a sphere was hit.
+int HitSpheres(Ray r, int sphereCount, float tMin, float tMax, inout Hit outHit)
+{
+    int closestId = -1;
+    float closestT = tMax;
+    for (int i = 0; i < sphereCount; ++i)
+    {
+        Sphere s = s_GroupSpheres[i];
+        float3 toCenter = s.center - r.orig;
+        float proj = dot(toCenter, r.dir);
+        float c = dot(toCenter, toCenter) - s.radius*s.radius;
+        float discr = proj * proj - c;
+        if (!(discr > 0))
+            continue; // no pair of intersection points with this sphere
+
+        float root = sqrt(discr);
+
+        // Prefer the nearer intersection; when it lies at or before tMin,
+        // fall back to the farther one.
+        float tNear = proj - root;
+        float t = (tNear <= tMin) ? (proj + root) : tNear;
+
+        if (t > tMin && t < closestT)
+        {
+            closestId = i;
+            closestT = t;
+        }
+    }
+
+    if (closestId == -1)
+        return closestId;
+
+    outHit.pos = RayPointAt(r, closestT);
+    outHit.normal = (outHit.pos - s_GroupSpheres[closestId].center) * s_GroupSpheres[closestId].invRadius;
+    outHit.t = closestT;
+    return closestId;
+}
+
+struct Params // per-dispatch parameters; main() reads element 0 of g_Params
+{
+    Camera cam; // camera passed to CameraGetRay
+    int sphereCount; // number of spheres/materials copied to group shared memory and tested
+    int screenWidth; // not referenced in this shader
+    int screenHeight; // not referenced in this shader
+    int frames; // mixed into the per-pixel RNG seed
+    float invWidth; // scales (pixel x + jitter) to the u coordinate
+    float invHeight; // scales (pixel y + jitter) to the v coordinate
+    float lerpFac; // lerp factor from the new colour towards the previous image
+    int emissiveCount; // number of entries copied from g_Emissives
+};
+
+
+#define kMinT 0.001f
+#define kMaxT 1.0e7f
+#define kMaxDepth 10
+
+
+// Scene intersection entry point: forwards to HitSpheres (note the different
+// argument order) and returns its result unchanged.
+static int HitWorld(int sphereCount, Ray r, float tMin, float tMax, inout Hit outHit)
+{
+    int hitId = HitSpheres(r, sphereCount, tMin, tMax, outHit);
+    return hitId;
+}
+
+
+// Computes how ray r_in scatters at hit `rec` on the object with index matID.
+//   attenuation   - colour multiplier for the scattered path
+//   scattered     - the outgoing ray
+//   outLightE     - direct light gathered by sampling emissive spheres
+//                   (Lambert materials with DO_LIGHT_SAMPLING only, else zero)
+//   inoutRayCount - incremented once per shadow ray shot
+//   state         - RNG state, advanced by every random draw
+// Returns false when the path should stop: a metal reflection that ends up
+// below the surface, or an unknown material type.
+static bool Scatter(int sphereCount, int emissiveCount, int matID, Ray r_in, Hit rec, out float3 attenuation, out Ray scattered, out float3 outLightE, inout int inoutRayCount, inout uint state)
+{
+    outLightE = float3(0, 0, 0);
+    Material mat = s_GroupMaterials[matID];
+    if (mat.type == MatLambert)
+    {
+        // random point on unit sphere that is tangent to the hit point
+        float3 target = rec.pos + rec.normal + RandomUnitVector(state);
+        scattered = MakeRay(rec.pos, normalize(target - rec.pos));
+        attenuation = mat.albedo;
+
+        // sample lights
+#if DO_LIGHT_SAMPLING
+        for (int j = 0; j < emissiveCount; ++j)
+        {
+            int i = s_GroupEmissives[j];
+            if (matID == i)
+                continue; // skip self
+            Material smat = s_GroupMaterials[i];
+            Sphere s = s_GroupSpheres[i];
+
+            // create a random direction towards sphere
+            // coord system for sampling: sw, su, sv
+            float3 sw = normalize(s.center - rec.pos);
+            float3 su = normalize(cross(abs(sw.x)>0.01f ? float3(0, 1, 0) : float3(1, 0, 0), sw));
+            float3 sv = cross(sw, su);
+            // sample sphere by solid angle
+            // note: clamp the sqrt arguments; when the hit point is on or inside
+            // the light sphere (or due to rounding) they can go negative, and the
+            // resulting NaN would persist through the temporal blend in main()
+            float cosAMax = sqrt(max(0.0f, 1.0f - s.radius*s.radius / dot(rec.pos - s.center, rec.pos - s.center)));
+            float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state);
+            float cosA = 1.0f - eps1 + eps1 * cosAMax;
+            float sinA = sqrt(max(0.0f, 1.0f - cosA * cosA));
+            float phi = 2 * 3.1415926 * eps2;
+            float3 l = su * cos(phi) * sinA + sv * sin(phi) * sinA + sw * cosA;
+
+            // shoot shadow ray
+            Hit lightHit;
+            ++inoutRayCount;
+            int hitID = HitWorld(sphereCount, MakeRay(rec.pos, l), kMinT, kMaxT, lightHit);
+            if (hitID == i)
+            {
+                // solid angle of the sampled cone
+                float omega = 2 * 3.1415926 * (1 - cosAMax);
+
+                float3 rdir = r_in.dir;
+                // normal flipped to face against the incoming ray
+                float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal;
+                outLightE += (mat.albedo * smat.emissive) * (max(0.0f, dot(l, nl)) * omega / 3.1415926);
+            }
+        }
+#endif
+        return true;
+    }
+    else if (mat.type == MatMetal)
+    {
+        float3 refl = reflect(r_in.dir, rec.normal);
+        // reflected ray, and random inside of sphere based on roughness
+        float roughness = mat.roughness;
+#if DO_MITSUBA_COMPARE
+        roughness = 0; // until we get better BRDF for metals
+#endif
+        scattered = MakeRay(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state)));
+        attenuation = mat.albedo;
+        // only keep reflections that leave the surface
+        return dot(scattered.dir, rec.normal) > 0;
+    }
+    else if (mat.type == MatDielectric)
+    {
+        float3 outwardN;
+        float3 rdir = r_in.dir;
+        float3 refl = reflect(rdir, rec.normal);
+        float nint;
+        attenuation = float3(1, 1, 1);
+        float3 refr;
+        float reflProb;
+        float cosine;
+        if (dot(rdir, rec.normal) > 0)
+        {
+            // ray travels along the normal: flip the normal and use ri directly
+            outwardN = -rec.normal;
+            nint = mat.ri;
+            cosine = mat.ri * dot(rdir, rec.normal);
+        }
+        else
+        {
+            // ray travels against the normal: use the inverse index ratio
+            outwardN = rec.normal;
+            nint = 1.0f / mat.ri;
+            cosine = -dot(rdir, rec.normal);
+        }
+        if (refract(rdir, outwardN, nint, refr))
+        {
+            reflProb = schlick(cosine, mat.ri);
+        }
+        else
+        {
+            // no refraction possible: always reflect
+            reflProb = 1;
+        }
+        if (RandomFloat01(state) < reflProb)
+            scattered = MakeRay(rec.pos, normalize(refl));
+        else
+            scattered = MakeRay(rec.pos, normalize(refr));
+    }
+    else
+    {
+        // unknown material type: magenta attenuation, dummy ray, stop the path
+        attenuation = float3(1, 0, 1);
+        scattered = MakeRay(float3(0,0,0), float3(0, 0, 1));
+        return false;
+    }
+    return true;
+}
+
+// Traces ray r through the scene for up to kMaxDepth bounces and returns the
+// gathered colour. inoutRayCount is incremented for every ray traced here
+// (Scatter adds its shadow rays); state is the RNG state.
+static float3 Trace(int sphereCount, int emissiveCount, Ray r, inout int inoutRayCount, inout uint state)
+{
+    float3 radiance = 0;
+    float3 throughput = 1;
+    bool doMaterialE = true;
+    // GPUs don't support recursion, so do tracing iterations in a loop up to max depth
+    for (int depth = 0; depth < kMaxDepth; ++depth)
+    {
+        Hit rec;
+        ++inoutRayCount;
+        int id = HitWorld(sphereCount, r, kMinT, kMaxT, rec);
+        if (id < 0)
+        {
+            // nothing hit: add the sky contribution and finish the path
+#if DO_MITSUBA_COMPARE
+            radiance += throughput * float3(0.15f, 0.21f, 0.3f); // easier compare with Mitsuba's constant environment light
+#else
+            float t = 0.5f*(r.dir.y + 1.0f);
+            float3 skyCol = ((1.0f - t)*float3(1.0f, 1.0f, 1.0f) + t * float3(0.5f, 0.7f, 1.0f)) * 0.3f;
+            radiance += throughput * skyCol;
+#endif
+            break;
+        }
+
+        Ray scattered;
+        float3 attenuation;
+        float3 lightE;
+        Material mat = s_GroupMaterials[id];
+        float3 matE = mat.emissive;
+        bool didScatter = Scatter(sphereCount, emissiveCount, id, r, rec, attenuation, scattered, lightE, inoutRayCount, state);
+        if (!didScatter)
+        {
+            // path ends here: only the surface's own emission is added
+            radiance += throughput * matE;
+            break;
+        }
+
+#if DO_LIGHT_SAMPLING
+        // drop the emission when the previous bounce was Lambert, which has
+        // already gathered light through explicit light sampling
+        if (!doMaterialE) matE = 0;
+        doMaterialE = (mat.type != MatLambert);
+#endif
+        radiance += throughput * (matE + lightE);
+        throughput *= attenuation;
+        r = scattered;
+    }
+    return radiance;
+}
+
+Texture2D srcImage : register(t0); // previous image, read per pixel in main()
+RWTexture2D<float4> dstImage : register(u0); // output image, written per pixel in main()
+StructuredBuffer<Sphere> g_Spheres : register(t1); // scene spheres, copied to s_GroupSpheres
+StructuredBuffer<Material> g_Materials : register(t2); // per-sphere materials, copied to s_GroupMaterials
+StructuredBuffer<Params> g_Params : register(t3); // only element 0 is read
+StructuredBuffer<int> g_Emissives : register(t4); // indices of emissive spheres, copied to s_GroupEmissives
+RWByteAddressBuffer g_OutRayCount : register(u1); // ray counter; each thread atomically adds its count at offset 0
+
+// Compute shader entry point: one thread per pixel. Each thread group first
+// stages the scene in group shared memory, then every thread traces
+// DO_SAMPLES_PER_PIXEL paths, blends the result with the previous image and
+// adds its ray count to g_OutRayCount.
+[numthreads(kCSGroupSizeX, kCSGroupSizeY, 1)]
+void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID)
+{
+    Params params = g_Params[0];
+
+    // Stage scene data (spheres, materials, emissive indices) in group shared
+    // memory. The copy is split across the group: every thread handles its own
+    // contiguous slice of indices.
+    uint groupSize = kCSGroupSizeX * kCSGroupSizeY;
+    uint objCount = params.sphereCount;
+    uint perThread = (objCount + groupSize - 1) / groupSize;
+    uint sliceStart = (tid.y * kCSGroupSizeX + tid.x) * perThread;
+    uint sliceEnd = sliceStart + perThread;
+    for (uint io = sliceStart; io < sliceEnd; ++io)
+    {
+        if (io < objCount)
+        {
+            s_GroupSpheres[io] = g_Spheres[io];
+            s_GroupMaterials[io] = g_Materials[io];
+        }
+        if (io < params.emissiveCount)
+        {
+            s_GroupEmissives[io] = g_Emissives[io];
+        }
+    }
+    // wait until the whole group has finished copying
+    GroupMemoryBarrierWithGroupSync();
+
+    // per-pixel, per-frame RNG seed; "| 1" keeps it non-zero
+    uint rngState = (gid.x * 1973 + gid.y * 9277 + params.frames * 26699) | 1;
+
+    int rayCount = 0;
+    float3 accum = 0;
+    for (int sampleIdx = 0; sampleIdx < DO_SAMPLES_PER_PIXEL; sampleIdx++)
+    {
+        // jitter the sample position inside the pixel
+        float u = float(gid.x + RandomFloat01(rngState)) * params.invWidth;
+        float v = float(gid.y + RandomFloat01(rngState)) * params.invHeight;
+        Ray r = CameraGetRay(params.cam, u, v, rngState);
+        accum += Trace(params.sphereCount, params.emissiveCount, r, rayCount, rngState);
+    }
+    accum *= 1.0f / float(DO_SAMPLES_PER_PIXEL);
+
+    // blend with the previous image and store the result
+    float3 prev = srcImage.Load(int3(gid.xy,0)).rgb;
+    float3 blended = lerp(accum, prev, params.lerpFac);
+    dstImage[gid.xy] = float4(blended, 1);
+
+    g_OutRayCount.InterlockedAdd(0, rayCount);
+}
--- a/examples/ToyPathTracer/Windows/PixelShader.hlsl
+++ b/examples/ToyPathTracer/Windows/PixelShader.hlsl
@ -0,0 +1,15 @@
+// Converts a linear colour to sRGB with a single power-curve approximation
+// (1.055 * x^(1/2.4) - 0.055); negative inputs and results are clamped to 0.
+float3 LinearToSRGB(float3 rgb)
+{
+    float3 nonNegative = max(rgb, float3(0, 0, 0));
+    float3 encoded = 1.055 * pow(nonNegative, 0.416666667) - 0.055;
+    return max(encoded, 0.0);
+}
+
+Texture2D tex : register(t0); // source image sampled by main()
+SamplerState smp : register(s0); // sampler used for that lookup
+
+// Pixel shader entry point: samples the source texture at uv, converts the
+// colour from linear to sRGB and outputs it with alpha 1.
+float4 main(float2 uv : TEXCOORD0) : SV_Target
+{
+    float3 linearCol = tex.Sample(smp, uv).rgb;
+    return float4(LinearToSRGB(linearCol), 1.0f);
+}
--- a/examples/ToyPathTracer/Windows/TestCpu.sln
+++ b/examples/ToyPathTracer/Windows/TestCpu.sln
@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.27130.2036
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TestCpu", "TestCpu.vcxproj", "{4F84B756-87F5-4B92-827B-DA087DAE1900}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Debug|x86 = Debug|x86
+		Release|x64 = Release|x64
+		Release|x86 = Release|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.ActiveCfg = Debug|x64
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.Build.0 = Debug|x64
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.ActiveCfg = Debug|Win32
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.Build.0 = Debug|Win32
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.ActiveCfg = Release|x64
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.Build.0 = Release|x64
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.ActiveCfg = Release|Win32
+		{4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {067FB780-37B8-465E-AD7E-E7B238B9C04F}
+	EndGlobalSection
+EndGlobal
--- a/examples/ToyPathTracer/Windows/TestCpu.vcxproj
+++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj
@ -0,0 +1,242 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{4F84B756-87F5-4B92-827B-DA087DAE1900}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>TestCpu</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v142</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <CallingConvention>VectorCall</CallingConvention>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>Disabled</Optimization>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <CallingConvention>VectorCall</CallingConvention>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <ExceptionHandling>false</ExceptionHandling>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <CallingConvention>VectorCall</CallingConvention>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <Optimization>MaxSpeed</Optimization>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <ExceptionHandling>false</ExceptionHandling>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <CallingConvention>VectorCall</CallingConvention>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\Source\enkiTS\TaskScheduler.cpp" />
+    <ClCompile Include="..\Source\enkiTS\TaskScheduler_c.cpp" />
+    <ClCompile Include="..\Source\Maths.cpp" />
+    <ClCompile Include="..\Source\Test.cpp" />
+    <ClCompile Include="TestWin.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\Source\Config.h" />
+    <ClInclude Include="..\Source\enkiTS\Atomics.h" />
+    <ClInclude Include="..\Source\enkiTS\LockLessMultiReadPipe.h" />
+    <ClInclude Include="..\Source\enkiTS\TaskScheduler.h" />
+    <ClInclude Include="..\Source\enkiTS\TaskScheduler_c.h" />
+    <ClInclude Include="..\Source\enkiTS\Threads.h" />
+    <ClInclude Include="..\Source\Maths.h" />
+    <ClInclude Include="..\Source\MathSimd.h" />
+    <ClInclude Include="..\Source\Test.h" />
+    <ClInclude Include="..\Source\stb_image.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\.editorconfig" />
+  </ItemGroup>
+  <ItemGroup>
+    <FxCompile Include="ComputeShader.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Compute</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_CSBytecode</VariableName>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledComputeShader.h</HeaderFileOutput>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_CSBytecode</VariableName>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledComputeShader.h</HeaderFileOutput>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_CSBytecode</VariableName>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledComputeShader.h</HeaderFileOutput>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_CSBytecode</VariableName>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledComputeShader.h</HeaderFileOutput>
+    </FxCompile>
+    <FxCompile Include="PixelShader.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Pixel</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Pixel</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Pixel</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Pixel</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledPixelShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledPixelShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledPixelShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledPixelShader.h</HeaderFileOutput>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_PSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_PSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_PSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_PSBytecode</VariableName>
+    </FxCompile>
+    <FxCompile Include="VertexShader.hlsl">
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Vertex</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Vertex</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Vertex</ShaderType>
+      <ShaderType Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Vertex</ShaderType>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">5.0</ShaderModel>
+      <ShaderModel Condition="'$(Configuration)|$(Platform)'=='Release|x64'">5.0</ShaderModel>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">CompiledVertexShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">CompiledVertexShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">CompiledVertexShader.h</HeaderFileOutput>
+      <HeaderFileOutput Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CompiledVertexShader.h</HeaderFileOutput>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">g_VSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">g_VSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">g_VSBytecode</VariableName>
+      <VariableName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">g_VSBytecode</VariableName>
+    </FxCompile>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
--- a/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters
+++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters
@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="TestWin.cpp" />
+    <ClCompile Include="..\Source\Test.cpp">
+      <Filter>Source</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Source\enkiTS\TaskScheduler.cpp">
+      <Filter>Source\enkiTS</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Source\enkiTS\TaskScheduler_c.cpp">
+      <Filter>Source\enkiTS</Filter>
+    </ClCompile>
+    <ClCompile Include="..\Source\Maths.cpp">
+      <Filter>Source</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <Filter Include="Source">
+      <UniqueIdentifier>{5f19f217-c1c7-4eeb-be61-8b986fee9375}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Source\enkiTS">
+      <UniqueIdentifier>{38c448a8-1dcc-4116-9410-a9f8d068caff}</UniqueIdentifier>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\Source\Test.h">
+      <Filter>Source</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\stb_image.h">
+      <Filter>Source</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\enkiTS\Atomics.h">
+      <Filter>Source\enkiTS</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\enkiTS\LockLessMultiReadPipe.h">
+      <Filter>Source\enkiTS</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\enkiTS\TaskScheduler.h">
+      <Filter>Source\enkiTS</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\enkiTS\TaskScheduler_c.h">
+      <Filter>Source\enkiTS</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\enkiTS\Threads.h">
+      <Filter>Source\enkiTS</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\Maths.h">
+      <Filter>Source</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\Config.h">
+      <Filter>Source</Filter>
+    </ClInclude>
+    <ClInclude Include="..\Source\MathSimd.h">
+      <Filter>Source</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\.editorconfig" />
+  </ItemGroup>
+  <ItemGroup>
+    <FxCompile Include="VertexShader.hlsl" />
+    <FxCompile Include="PixelShader.hlsl" />
+    <FxCompile Include="ComputeShader.hlsl" />
+  </ItemGroup>
+</Project>
--- a/examples/ToyPathTracer/Windows/TestWin.cpp
+++ b/examples/ToyPathTracer/Windows/TestWin.cpp
@ -0,0 +1,540 @@
+#include <stdint.h>
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <d3d11_1.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <algorithm>
+
+#include "../Source/Config.h"
+#include "../Source/Maths.h"
+#include "../Source/Test.h"
+#include "CompiledVertexShader.h"
+#include "CompiledPixelShader.h"
+
+// Application instance and main window handle; both are set by InitInstance().
+static HINSTANCE g_HInstance;
+static HWND g_Wnd;
+
+// Window class / window creation helpers and the window procedure.
+// NOTE(review): About() is only declared here; no definition appears in this file.
+ATOM                MyRegisterClass(HINSTANCE hInstance);
+BOOL                InitInstance(HINSTANCE, int);
+LRESULT CALLBACK    WndProc(HWND, UINT, WPARAM, LPARAM);
+INT_PTR CALLBACK    About(HWND, UINT, WPARAM, LPARAM);
+
+static HRESULT InitD3DDevice();
+static void ShutdownD3DDevice();
+static void RenderFrame();
+
+// CPU-side image: kBackbufferWidth * kBackbufferHeight pixels, 4 floats per pixel (allocated in wWinMain).
+static float* g_Backbuffer;
+
+// D3D11 device objects created by InitD3DDevice().
+static D3D_FEATURE_LEVEL g_D3D11FeatureLevel = D3D_FEATURE_LEVEL_11_0;
+static ID3D11Device* g_D3D11Device = nullptr;
+static ID3D11DeviceContext* g_D3D11Ctx = nullptr;
+static IDXGISwapChain* g_D3D11SwapChain = nullptr;
+static ID3D11RenderTargetView* g_D3D11RenderTarget = nullptr;
+// Resources created in wWinMain and used by RenderFrame() to draw the backbuffer texture to the screen.
+static ID3D11VertexShader* g_VertexShader;
+static ID3D11PixelShader* g_PixelShader;
+// Two float4 textures plus their views; the UAVs are only created when DO_COMPUTE_GPU is enabled.
+static ID3D11Texture2D *g_BackbufferTexture, *g_BackbufferTexture2;
+static ID3D11ShaderResourceView *g_BackbufferSRV, *g_BackbufferSRV2;
+static ID3D11UnorderedAccessView *g_BackbufferUAV, *g_BackbufferUAV2;
+static ID3D11SamplerState* g_SamplerLinear;
+static ID3D11RasterizerState* g_RasterState;
+// Selects which backbuffer SRV the pixel shader samples (0 -> g_BackbufferSRV, 1 -> g_BackbufferSRV2).
+// Only the DO_COMPUTE_GPU path in RenderFrame() ever flips it.
+static int g_BackbufferIndex;
+
+
+#if DO_COMPUTE_GPU
+#include "CompiledComputeShader.h"
+// Per-frame parameters for the compute shader. RenderFrame() fills one instance and uploads it
+// verbatim into g_DataParams (a structured buffer of sizeof(ComputeParams) bytes), so the field
+// order and sizes are part of the CPU/GPU contract.
+// NOTE(review): presumably this must mirror the params struct declared in ComputeShader.hlsl -- confirm before reordering.
+struct ComputeParams
+{
+    Camera cam;         // filled by GetSceneDesc(); wWinMain asserts the camera size is 88 bytes
+    int sphereCount;    // g_SphereCount
+    int screenWidth;    // kBackbufferWidth
+    int screenHeight;   // kBackbufferHeight
+    int frames;         // s_FrameCount at the time of dispatch
+    float invWidth;     // 1.0f / kBackbufferWidth
+    float invHeight;    // 1.0f / kBackbufferHeight
+    float lerpFac;      // frames/(frames+1), scaled by DO_ANIMATE_SMOOTHING when animating, 0 when not progressive
+    int emissiveCount;  // written by GetSceneDesc()
+};
+static ID3D11ComputeShader* g_ComputeShader;
+// Structured buffers (and their SRVs) holding the scene data uploaded every frame by RenderFrame().
+static ID3D11Buffer* g_DataSpheres;     static ID3D11ShaderResourceView* g_SRVSpheres;
+static ID3D11Buffer* g_DataMaterials;   static ID3D11ShaderResourceView* g_SRVMaterials;
+static ID3D11Buffer* g_DataParams;      static ID3D11ShaderResourceView* g_SRVParams;
+static ID3D11Buffer* g_DataEmissives;   static ID3D11ShaderResourceView* g_SRVEmissives;
+// 4-byte raw buffer bound as a UAV; RenderFrame() reads it back as the ray count and then resets it to zero.
+static ID3D11Buffer* g_DataCounter;     static ID3D11UnorderedAccessView* g_UAVCounter;
+// Object count and per-element byte sizes reported by GetObjectCount() in wWinMain.
+static int g_SphereCount, g_ObjSize, g_MatSize;
+// Timestamp queries bracketing the compute dispatch, plus the disjoint query that validates them.
+static ID3D11Query *g_QueryBegin, *g_QueryEnd, *g_QueryDisjoint;
+#endif // #if DO_COMPUTE_GPU
+
+// Application entry point.
+//
+// Allocates the CPU backbuffer, initializes the test scene, creates the window and the D3D11
+// device, creates every GPU resource used by RenderFrame(), and then pumps window messages,
+// rendering a frame whenever the message queue is empty.
+//
+// Returns the wParam of WM_QUIT on a normal exit, FALSE if the window could not be created, and
+// 0 if the D3D11 device could not be initialized. Every exit path undoes InitializeTest() and
+// frees the CPU backbuffer.
+int APIENTRY wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR, _In_ int nCmdShow)
+{
+    // 4 floats per pixel, cleared to zero before the first frame.
+    g_Backbuffer = new float[kBackbufferWidth * kBackbufferHeight * 4];
+    memset(g_Backbuffer, 0, kBackbufferWidth * kBackbufferHeight * 4 * sizeof(g_Backbuffer[0]));
+
+    InitializeTest();
+
+    MyRegisterClass(hInstance);
+    if (!InitInstance (hInstance, nCmdShow))
+    {
+        // InitializeTest() already ran, so pair it with ShutdownTest() before bailing out.
+        ShutdownTest();
+        delete[] g_Backbuffer;
+        g_Backbuffer = nullptr;
+        return FALSE;
+    }
+
+    if (FAILED(InitD3DDevice()))
+    {
+        ShutdownTest();
+        ShutdownD3DDevice();
+        delete[] g_Backbuffer;
+        g_Backbuffer = nullptr;
+        return 0;
+    }
+
+    g_D3D11Device->CreateVertexShader(g_VSBytecode, ARRAYSIZE(g_VSBytecode), NULL, &g_VertexShader);
+    g_D3D11Device->CreatePixelShader(g_PSBytecode, ARRAYSIZE(g_PSBytecode), NULL, &g_PixelShader);
+#if DO_COMPUTE_GPU
+    g_D3D11Device->CreateComputeShader(g_CSBytecode, ARRAYSIZE(g_CSBytecode), NULL, &g_ComputeShader);
+#endif
+
+    // Two float4 textures of backbuffer size. In GPU mode they are written through UAVs by the
+    // compute shader; in CPU mode they are dynamic textures that RenderFrame() fills via Map().
+    D3D11_TEXTURE2D_DESC texDesc = {};
+    texDesc.Width = kBackbufferWidth;
+    texDesc.Height = kBackbufferHeight;
+    texDesc.MipLevels = 1;
+    texDesc.ArraySize = 1;
+    texDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+    texDesc.SampleDesc.Count = 1;
+    texDesc.SampleDesc.Quality = 0;
+#if DO_COMPUTE_GPU
+    texDesc.Usage = D3D11_USAGE_DEFAULT;
+    texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
+    texDesc.CPUAccessFlags = 0;
+#else
+    texDesc.Usage = D3D11_USAGE_DYNAMIC;
+    texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+    texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+#endif
+    texDesc.MiscFlags = 0;
+    g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture);
+    g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture2);
+
+    D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+    srvDesc.Format = texDesc.Format;
+    srvDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
+    srvDesc.Texture2D.MipLevels = 1;
+    srvDesc.Texture2D.MostDetailedMip = 0;
+    g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture, &srvDesc, &g_BackbufferSRV);
+    g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture2, &srvDesc, &g_BackbufferSRV2);
+
+    D3D11_SAMPLER_DESC smpDesc = {};
+    smpDesc.Filter = D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT;
+    smpDesc.AddressU = smpDesc.AddressV = smpDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+    g_D3D11Device->CreateSamplerState(&smpDesc, &g_SamplerLinear);
+
+    D3D11_RASTERIZER_DESC rasterDesc = {};
+    rasterDesc.FillMode = D3D11_FILL_SOLID;
+    rasterDesc.CullMode = D3D11_CULL_NONE;
+    g_D3D11Device->CreateRasterizerState(&rasterDesc, &g_RasterState);
+
+#if DO_COMPUTE_GPU
+    D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+
+    // The asserted sizes are the strides used for the structured buffers below.
+    int camSize;
+    GetObjectCount(g_SphereCount, g_ObjSize, g_MatSize, camSize);
+    assert(g_ObjSize == 20);
+    assert(g_MatSize == 36);
+    assert(camSize == 88);
+
+    // Spheres: one structured element per object.
+    D3D11_BUFFER_DESC bdesc = {};
+    bdesc.ByteWidth = g_SphereCount * g_ObjSize;
+    bdesc.Usage = D3D11_USAGE_DEFAULT;
+    bdesc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+    bdesc.CPUAccessFlags = 0;
+    bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+    bdesc.StructureByteStride = g_ObjSize;
+    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataSpheres);
+    srvDesc.Format = DXGI_FORMAT_UNKNOWN;
+    srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
+    srvDesc.Buffer.FirstElement = 0;
+    srvDesc.Buffer.NumElements = g_SphereCount;
+    g_D3D11Device->CreateShaderResourceView(g_DataSpheres, &srvDesc, &g_SRVSpheres);
+
+    // Materials: one structured element per object.
+    bdesc.ByteWidth = g_SphereCount * g_MatSize;
+    bdesc.StructureByteStride = g_MatSize;
+    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataMaterials);
+    srvDesc.Buffer.NumElements = g_SphereCount;
+    g_D3D11Device->CreateShaderResourceView(g_DataMaterials, &srvDesc, &g_SRVMaterials);
+
+    // Parameters: a single ComputeParams element.
+    bdesc.ByteWidth = sizeof(ComputeParams);
+    bdesc.StructureByteStride = sizeof(ComputeParams);
+    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataParams);
+    srvDesc.Buffer.NumElements = 1;
+    g_D3D11Device->CreateShaderResourceView(g_DataParams, &srvDesc, &g_SRVParams);
+
+    // Emissives: one 4-byte element per object.
+    bdesc.ByteWidth = g_SphereCount * 4;
+    bdesc.StructureByteStride = 4;
+    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataEmissives);
+    srvDesc.Buffer.NumElements = g_SphereCount;
+    g_D3D11Device->CreateShaderResourceView(g_DataEmissives, &srvDesc, &g_SRVEmissives);
+
+    // Ray counter: 4-byte raw buffer, written through a UAV and read back by RenderFrame().
+    bdesc.ByteWidth = 4;
+    bdesc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS;
+    bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
+    bdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+    g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataCounter);
+    uavDesc.Format = DXGI_FORMAT_R32_TYPELESS;
+    uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+    uavDesc.Buffer.FirstElement = 0;
+    uavDesc.Buffer.NumElements = 1;
+    uavDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
+    g_D3D11Device->CreateUnorderedAccessView(g_DataCounter, &uavDesc, &g_UAVCounter);
+
+    // UAVs over both backbuffer textures so the compute shader can write either one.
+    uavDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT;
+    uavDesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
+    uavDesc.Texture2D.MipSlice = 0;
+    g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture, &uavDesc, &g_BackbufferUAV);
+    g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture2, &uavDesc, &g_BackbufferUAV2);
+
+    // GPU timing queries used by RenderFrame().
+    D3D11_QUERY_DESC qDesc = {};
+    qDesc.Query = D3D11_QUERY_TIMESTAMP;
+    g_D3D11Device->CreateQuery(&qDesc, &g_QueryBegin);
+    g_D3D11Device->CreateQuery(&qDesc, &g_QueryEnd);
+    qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
+    g_D3D11Device->CreateQuery(&qDesc, &g_QueryDisjoint);
+#endif // #if DO_COMPUTE_GPU
+
+
+    // Main message loop: handle pending messages first, render only when the queue is empty.
+    MSG msg = { 0 };
+    while (msg.message != WM_QUIT)
+    {
+        if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE))
+        {
+            TranslateMessage(&msg);
+            DispatchMessage(&msg);
+        }
+        else
+        {
+            RenderFrame();
+        }
+    }
+
+    ShutdownTest();
+    ShutdownD3DDevice();
+    delete[] g_Backbuffer;
+    g_Backbuffer = nullptr;
+
+    return (int) msg.wParam;
+}
+
+
+// Registers the "TestClass" window class (wide-character variant) with WndProc as its window
+// procedure. Returns whatever RegisterClassExW returns (the class atom).
+ATOM MyRegisterClass(HINSTANCE hInstance)
+{
+    // Value-initialize so every field not set below (extra bytes, icons, menu name) stays zero.
+    WNDCLASSEXW wcex = {};
+    // Size from the variable itself rather than the TCHAR-dependent WNDCLASSEX alias, so the
+    // size always matches the explicit W struct passed to RegisterClassExW.
+    wcex.cbSize         = sizeof(wcex);
+    wcex.style          = CS_HREDRAW | CS_VREDRAW;
+    wcex.lpfnWndProc    = WndProc;
+    wcex.hInstance      = hInstance;
+    wcex.hCursor        = LoadCursor(nullptr, IDC_ARROW);
+    wcex.hbrBackground  = (HBRUSH)(COLOR_WINDOW+1);
+    wcex.lpszClassName  = L"TestClass";
+    return RegisterClassExW(&wcex);
+}
+
+// Creates and shows the main window, sized so that its client area is
+// kBackbufferWidth x kBackbufferHeight. Stores the instance handle in g_HInstance and, on
+// success, the window handle in g_Wnd. Returns FALSE if the window could not be created.
+BOOL InitInstance(HINSTANCE hInstance, int nCmdShow)
+{
+    g_HInstance = hInstance;
+
+    const DWORD windowStyle = WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX;
+
+    // Start from the desired client rectangle and let Windows expand it to the full window size.
+    RECT windowRect = { 0, 0, kBackbufferWidth, kBackbufferHeight };
+    AdjustWindowRect(&windowRect, windowStyle, FALSE);
+    const int windowWidth = windowRect.right - windowRect.left;
+    const int windowHeight = windowRect.bottom - windowRect.top;
+
+    HWND window = CreateWindowW(
+        L"TestClass", L"Test", windowStyle,
+        CW_USEDEFAULT, CW_USEDEFAULT, windowWidth, windowHeight,
+        nullptr, nullptr, hInstance, nullptr);
+    if (window == nullptr)
+    {
+        return FALSE;
+    }
+
+    g_Wnd = window;
+    ShowWindow(window, nCmdShow);
+    UpdateWindow(window);
+    return TRUE;
+}
+
+static uint64_t s_Time;
+static int s_Count;
+static char s_Buffer[200];
+static unsigned s_Flags = kFlagProgressive;
+static int s_FrameCount = 0;
+
+
+// Renders one frame and presents it.
+//
+// With DO_COMPUTE_GPU: uploads the scene to the GPU, dispatches the compute shader into one of
+// the two backbuffer textures (reading the other one), draws the result, and then blocks on GPU
+// timestamp queries to update the timing/ray statistics shown in the window title.
+//
+// Without DO_COMPUTE_GPU: runs UpdateTest/DrawTest on the CPU into g_Backbuffer, updates the
+// statistics from QueryPerformanceCounter, copies g_Backbuffer into g_BackbufferTexture and
+// draws it.
+//
+// Failed Map() calls and failed query reads are skipped instead of dereferencing or reading
+// uninitialized data.
+static void RenderFrame()
+{
+    LARGE_INTEGER time1;
+
+#if DO_COMPUTE_GPU
+    QueryPerformanceCounter(&time1);
+    float t = float(clock()) / CLOCKS_PER_SEC;
+    UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags);
+
+    // Swap the roles of the two backbuffer textures for this frame.
+    g_BackbufferIndex = 1 - g_BackbufferIndex;
+    void* dataSpheres = alloca(g_SphereCount * g_ObjSize);
+    void* dataMaterials = alloca(g_SphereCount * g_MatSize);
+    void* dataEmissives = alloca(g_SphereCount * 4);
+    ComputeParams dataParams;
+    GetSceneDesc(dataSpheres, dataMaterials, &dataParams.cam, dataEmissives, &dataParams.emissiveCount);
+
+    dataParams.sphereCount = g_SphereCount;
+    dataParams.screenWidth = kBackbufferWidth;
+    dataParams.screenHeight = kBackbufferHeight;
+    dataParams.frames = s_FrameCount;
+    dataParams.invWidth = 1.0f / kBackbufferWidth;
+    dataParams.invHeight = 1.0f / kBackbufferHeight;
+    // frames/(frames+1); damped while animating and forced to 0 when progressive mode is off.
+    float lerpFac = float(s_FrameCount) / float(s_FrameCount + 1);
+    if (s_Flags & kFlagAnimate)
+        lerpFac *= DO_ANIMATE_SMOOTHING;
+    if (!(s_Flags & kFlagProgressive))
+        lerpFac = 0;
+    dataParams.lerpFac = lerpFac;
+
+    g_D3D11Ctx->UpdateSubresource(g_DataSpheres, 0, NULL, dataSpheres, 0, 0);
+    g_D3D11Ctx->UpdateSubresource(g_DataMaterials, 0, NULL, dataMaterials, 0, 0);
+    g_D3D11Ctx->UpdateSubresource(g_DataParams, 0, NULL, &dataParams, 0, 0);
+    g_D3D11Ctx->UpdateSubresource(g_DataEmissives, 0, NULL, dataEmissives, 0, 0);
+
+    // Slot 0 reads the texture that is NOT being written this frame.
+    ID3D11ShaderResourceView* srvs[] = {
+        g_BackbufferIndex == 0 ? g_BackbufferSRV2 : g_BackbufferSRV,
+        g_SRVSpheres,
+        g_SRVMaterials,
+        g_SRVParams,
+        g_SRVEmissives
+    };
+    g_D3D11Ctx->CSSetShaderResources(0, ARRAYSIZE(srvs), srvs);
+    ID3D11UnorderedAccessView* uavs[] = {
+        g_BackbufferIndex == 0 ? g_BackbufferUAV : g_BackbufferUAV2,
+        g_UAVCounter
+    };
+    g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL);
+    g_D3D11Ctx->CSSetShader(g_ComputeShader, NULL, 0);
+    g_D3D11Ctx->Begin(g_QueryDisjoint);
+    g_D3D11Ctx->End(g_QueryBegin);
+    g_D3D11Ctx->Dispatch(kBackbufferWidth/kCSGroupSizeX, kBackbufferHeight/kCSGroupSizeY, 1);
+    g_D3D11Ctx->End(g_QueryEnd);
+    // Unbind the output texture UAV so the pixel shader below can sample it.
+    uavs[0] = NULL;
+    g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL);
+    ++s_FrameCount;
+
+#else
+    QueryPerformanceCounter(&time1);
+    float t = float(clock()) / CLOCKS_PER_SEC;
+    static size_t s_RayCounter = 0;
+    int rayCount;
+    UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags);
+    DrawTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, g_Backbuffer, rayCount, s_Flags);
+    s_FrameCount++;
+    s_RayCounter += rayCount;
+    LARGE_INTEGER time2;
+    QueryPerformanceCounter(&time2);
+    uint64_t dt = time2.QuadPart - time1.QuadPart;
+    ++s_Count;
+    s_Time += dt;
+    // Report averaged statistics once more than 10 frames have accumulated, then restart.
+    if (s_Count > 10)
+    {
+        LARGE_INTEGER frequency;
+        QueryPerformanceFrequency(&frequency);
+
+        double s = double(s_Time) / double(frequency.QuadPart) / s_Count;
+        sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount);
+        SetWindowTextA(g_Wnd, s_Buffer);
+        OutputDebugStringA(s_Buffer);
+        s_Count = 0;
+        s_Time = 0;
+        s_RayCounter = 0;
+    }
+
+    // Upload the CPU image. Rows are copied one by one because the mapped row pitch may be
+    // larger than the tightly packed source row (16 bytes per pixel).
+    D3D11_MAPPED_SUBRESOURCE mapped;
+    if (g_BackbufferTexture && SUCCEEDED(g_D3D11Ctx->Map(g_BackbufferTexture, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped)))
+    {
+        const uint8_t* src = (const uint8_t*)g_Backbuffer;
+        uint8_t* dst = (uint8_t*)mapped.pData;
+        for (int y = 0; y < kBackbufferHeight; ++y)
+        {
+            memcpy(dst, src, kBackbufferWidth * 16);
+            src += kBackbufferWidth * 16;
+            dst += mapped.RowPitch;
+        }
+        g_D3D11Ctx->Unmap(g_BackbufferTexture, 0);
+    }
+#endif
+
+    // Draw the current backbuffer texture with a 3-vertex draw call (no vertex buffer is bound).
+    g_D3D11Ctx->VSSetShader(g_VertexShader, NULL, 0);
+    g_D3D11Ctx->PSSetShader(g_PixelShader, NULL, 0);
+    g_D3D11Ctx->PSSetShaderResources(0, 1, g_BackbufferIndex == 0 ? &g_BackbufferSRV : &g_BackbufferSRV2);
+    g_D3D11Ctx->PSSetSamplers(0, 1, &g_SamplerLinear);
+    g_D3D11Ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
+    g_D3D11Ctx->RSSetState(g_RasterState);
+    g_D3D11Ctx->Draw(3, 0);
+    g_D3D11SwapChain->Present(0, 0);
+
+#if DO_COMPUTE_GPU
+    g_D3D11Ctx->End(g_QueryDisjoint);
+
+    // get GPU times
+    while (g_D3D11Ctx->GetData(g_QueryDisjoint, NULL, 0, 0) == S_FALSE) { Sleep(0); }
+    D3D11_QUERY_DATA_TIMESTAMP_DISJOINT tsDisjoint = {};
+    const HRESULT hrDisjoint = g_D3D11Ctx->GetData(g_QueryDisjoint, &tsDisjoint, sizeof(tsDisjoint), 0);
+    // Only trust the timestamps if the disjoint data was actually returned and says they are valid.
+    if (hrDisjoint == S_OK && !tsDisjoint.Disjoint)
+    {
+        UINT64 tsBegin, tsEnd;
+        // Note: on some GPUs/drivers, even when the disjoint query above already said "yeah I have data",
+        // might still not return "I have data" for timestamp queries before it.
+        while (g_D3D11Ctx->GetData(g_QueryBegin, &tsBegin, sizeof(tsBegin), 0) == S_FALSE) { Sleep(0); }
+        while (g_D3D11Ctx->GetData(g_QueryEnd, &tsEnd, sizeof(tsEnd), 0) == S_FALSE) { Sleep(0); }
+
+        float s = float(tsEnd - tsBegin) / float(tsDisjoint.Frequency);
+
+        // Read back the ray counter written by the compute shader, then reset it for the next frame.
+        static uint64_t s_RayCounter;
+        D3D11_MAPPED_SUBRESOURCE mapped;
+        if (SUCCEEDED(g_D3D11Ctx->Map(g_DataCounter, 0, D3D11_MAP_READ, 0, &mapped)))
+        {
+            s_RayCounter += *(const int*)mapped.pData;
+            g_D3D11Ctx->Unmap(g_DataCounter, 0);
+        }
+        int zeroCount = 0;
+        g_D3D11Ctx->UpdateSubresource(g_DataCounter, 0, NULL, &zeroCount, 0, 0);
+
+        // Accumulated GPU seconds; named distinctly so it does not shadow the file-level s_Time.
+        static float s_GpuTime;
+        ++s_Count;
+        s_GpuTime += s;
+        if (s_Count > 150)
+        {
+            s = s_GpuTime / s_Count;
+            sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount);
+            SetWindowTextA(g_Wnd, s_Buffer);
+            s_Count = 0;
+            s_GpuTime = 0;
+            s_RayCounter = 0;
+        }
+
+    }
+#endif // #if DO_COMPUTE_GPU
+}
+
+
+// Window procedure for the main window.
+//   WM_PAINT   - validates the window with an empty BeginPaint/EndPaint pair.
+//   WM_DESTROY - posts WM_QUIT, which ends the message loop.
+//   WM_CHAR    - 'a' toggles kFlagAnimate; 'p' toggles kFlagProgressive and restarts the frame count.
+// Every other message goes to DefWindowProc. Handled messages return 0.
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+    if (message == WM_PAINT)
+    {
+        PAINTSTRUCT paint;
+        BeginPaint(hWnd, &paint);
+        EndPaint(hWnd, &paint);
+        return 0;
+    }
+
+    if (message == WM_DESTROY)
+    {
+        PostQuitMessage(0);
+        return 0;
+    }
+
+    if (message == WM_CHAR)
+    {
+        switch (wParam)
+        {
+        case 'a':
+            s_Flags ^= kFlagAnimate;
+            break;
+        case 'p':
+            s_Flags ^= kFlagProgressive;
+            s_FrameCount = 0;
+            break;
+        default:
+            break;
+        }
+        return 0;
+    }
+
+    return DefWindowProc(hWnd, message, wParam, lParam);
+}
+
+
+// Creates the D3D11 device and immediate context, a windowed swap chain for g_Wnd sized to its
+// client area, the render target view of the swap chain's buffer 0, and a full-window viewport.
+// Returns S_OK on success or the first failing HRESULT; objects created before a failure are
+// left in their globals for ShutdownD3DDevice() to release.
+static HRESULT InitD3DDevice()
+{
+    RECT clientRect;
+    GetClientRect(g_Wnd, &clientRect);
+    const UINT width = clientRect.right - clientRect.left;
+    const UINT height = clientRect.bottom - clientRect.top;
+
+    UINT deviceFlags = 0;
+#ifdef _DEBUG
+    deviceFlags |= D3D11_CREATE_DEVICE_DEBUG;
+#endif
+
+    // Only feature level 11_0 is requested.
+    const D3D_FEATURE_LEVEL requestedLevels[] = { D3D_FEATURE_LEVEL_11_0 };
+    HRESULT hr = D3D11CreateDevice(
+        nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, deviceFlags,
+        requestedLevels, ARRAYSIZE(requestedLevels), D3D11_SDK_VERSION,
+        &g_D3D11Device, &g_D3D11FeatureLevel, &g_D3D11Ctx);
+    if (FAILED(hr))
+        return hr;
+
+    // Walk device -> adapter -> factory to obtain the DXGI factory that owns this device.
+    IDXGIDevice* dxgiDevice = nullptr;
+    hr = g_D3D11Device->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast<void**>(&dxgiDevice));
+    if (FAILED(hr))
+        return hr;
+
+    IDXGIAdapter* dxgiAdapter = nullptr;
+    hr = dxgiDevice->GetAdapter(&dxgiAdapter);
+    dxgiDevice->Release();
+    if (FAILED(hr))
+        return hr;
+
+    IDXGIFactory1* dxgiFactory = nullptr;
+    hr = dxgiAdapter->GetParent(__uuidof(IDXGIFactory1), reinterpret_cast<void**>(&dxgiFactory));
+    dxgiAdapter->Release();
+    if (FAILED(hr))
+        return hr;
+
+    // Create swap chain
+    DXGI_SWAP_CHAIN_DESC swapDesc = {};
+    swapDesc.BufferDesc.Width = width;
+    swapDesc.BufferDesc.Height = height;
+    swapDesc.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
+    swapDesc.BufferDesc.RefreshRate.Numerator = 60;
+    swapDesc.BufferDesc.RefreshRate.Denominator = 1;
+    swapDesc.SampleDesc.Count = 1;
+    swapDesc.SampleDesc.Quality = 0;
+    swapDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
+    swapDesc.BufferCount = 1;
+    swapDesc.OutputWindow = g_Wnd;
+    swapDesc.Windowed = TRUE;
+    hr = dxgiFactory->CreateSwapChain(g_D3D11Device, &swapDesc, &g_D3D11SwapChain);
+
+    // Prevent Alt-Enter. The factory is released before the swap chain result is checked, so it
+    // is not leaked on failure.
+    dxgiFactory->MakeWindowAssociation(g_Wnd, DXGI_MWA_NO_ALT_ENTER);
+    dxgiFactory->Release();
+
+    if (FAILED(hr))
+        return hr;
+
+    // Render target view over swap chain buffer 0
+    ID3D11Texture2D* swapChainBuffer = nullptr;
+    hr = g_D3D11SwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast<void**>(&swapChainBuffer));
+    if (FAILED(hr))
+        return hr;
+    hr = g_D3D11Device->CreateRenderTargetView(swapChainBuffer, nullptr, &g_D3D11RenderTarget);
+    swapChainBuffer->Release();
+    if (FAILED(hr))
+        return hr;
+
+    g_D3D11Ctx->OMSetRenderTargets(1, &g_D3D11RenderTarget, nullptr);
+
+    // Viewport covering the whole client area
+    D3D11_VIEWPORT viewport;
+    viewport.TopLeftX = 0;
+    viewport.TopLeftY = 0;
+    viewport.Width = (float)width;
+    viewport.Height = (float)height;
+    viewport.MinDepth = 0.0f;
+    viewport.MaxDepth = 1.0f;
+    g_D3D11Ctx->RSSetViewports(1, &viewport);
+
+    return S_OK;
+}
+
+static void ShutdownD3DDevice()
+{
+    if (g_D3D11Ctx) g_D3D11Ctx->ClearState();
+
+    if (g_D3D11RenderTarget) g_D3D11RenderTarget->Release();
+    if (g_D3D11SwapChain) g_D3D11SwapChain->Release();
+    if (g_D3D11Ctx) g_D3D11Ctx->Release();
+    if (g_D3D11Device) g_D3D11Device->Release();
+}
--- a/examples/ToyPathTracer/Windows/VertexShader.hlsl
+++ b/examples/ToyPathTracer/Windows/VertexShader.hlsl
@ -0,0 +1,13 @@
+// Output of the vertex shader's main(): one texture coordinate plus the SV_Position output.
+struct vs2ps
+{
+    float2 uv : TEXCOORD0;      // main() writes 0 or 2 per component
+    float4 pos : SV_Position;   // main() writes (uv * 2 - 1, 0, 1)
+};
+
+// Builds a vertex from SV_VertexID alone; no vertex buffer input is read.
+// For vid = 0, 1, 2 this yields uv = (0,0), (2,0), (0,2) and pos.xy = (-1,-1), (3,-1), (-1,3).
+vs2ps main(uint vid : SV_VertexID)
+{
+    // Each component is either 0 or 2, picked from the low bits of the vertex index.
+    float u = (vid << 1) & 2;
+    float v = vid & 2;
+
+    vs2ps result;
+    result.uv = float2(u, v);
+    // Map uv range [0,2] to position range [-1,3].
+    result.pos = float4(result.uv * 2.0f - 1.0f, 0, 1);
+    return result;
+}
--- a/examples/ToyPathTracer/license.md
+++ b/examples/ToyPathTracer/license.md
@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org>