From 0b1eff8b0d8bba03bfb5df19f23757793ff54b21 Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Tue, 29 Oct 2019 22:21:34 +0100 Subject: [PATCH] Add aras-p's ToyPathTracer. https://github.com/aras-p/ToyPathTracer b076563906169aa2f9e6d7218ef85decf81f8f72 --- examples/ToyPathTracer/README | 1 + examples/ToyPathTracer/Source/Config.h | 33 ++ examples/ToyPathTracer/Source/MathSimd.h | 192 +++++++ examples/ToyPathTracer/Source/Maths.cpp | 203 +++++++ examples/ToyPathTracer/Source/Maths.h | 436 ++++++++++++++ examples/ToyPathTracer/Source/Test.cpp | 380 ++++++++++++ examples/ToyPathTracer/Source/Test.h | 17 + .../ToyPathTracer/Source/enkiTS/Atomics.h | 79 +++ .../Source/enkiTS/LockLessMultiReadPipe.h | 240 ++++++++ .../Source/enkiTS/TaskScheduler.cpp | 437 ++++++++++++++ .../Source/enkiTS/TaskScheduler.h | 177 ++++++ .../Source/enkiTS/TaskScheduler_c.cpp | 122 ++++ .../Source/enkiTS/TaskScheduler_c.h | 104 ++++ .../ToyPathTracer/Source/enkiTS/Threads.h | 210 +++++++ .../ToyPathTracer/Windows/ComputeShader.hlsl | 395 +++++++++++++ .../ToyPathTracer/Windows/PixelShader.hlsl | 15 + examples/ToyPathTracer/Windows/TestCpu.sln | 31 + .../ToyPathTracer/Windows/TestCpu.vcxproj | 242 ++++++++ .../Windows/TestCpu.vcxproj.filters | 66 +++ examples/ToyPathTracer/Windows/TestWin.cpp | 540 ++++++++++++++++++ .../ToyPathTracer/Windows/VertexShader.hlsl | 13 + examples/ToyPathTracer/license.md | 24 + 22 files changed, 3957 insertions(+) create mode 100644 examples/ToyPathTracer/README create mode 100644 examples/ToyPathTracer/Source/Config.h create mode 100644 examples/ToyPathTracer/Source/MathSimd.h create mode 100644 examples/ToyPathTracer/Source/Maths.cpp create mode 100644 examples/ToyPathTracer/Source/Maths.h create mode 100644 examples/ToyPathTracer/Source/Test.cpp create mode 100644 examples/ToyPathTracer/Source/Test.h create mode 100644 examples/ToyPathTracer/Source/enkiTS/Atomics.h create mode 100644 examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h create mode 100644 examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp create mode 100644 examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h create mode 100644 examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp create mode 100644 examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h create mode 100644 examples/ToyPathTracer/Source/enkiTS/Threads.h create mode 100644 examples/ToyPathTracer/Windows/ComputeShader.hlsl create mode 100644 examples/ToyPathTracer/Windows/PixelShader.hlsl create mode 100644 examples/ToyPathTracer/Windows/TestCpu.sln create mode 100644 examples/ToyPathTracer/Windows/TestCpu.vcxproj create mode 100644 examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters create mode 100644 examples/ToyPathTracer/Windows/TestWin.cpp create mode 100644 examples/ToyPathTracer/Windows/VertexShader.hlsl create mode 100644 examples/ToyPathTracer/license.md diff --git a/examples/ToyPathTracer/README b/examples/ToyPathTracer/README new file mode 100644 index 00000000..6e33b790 --- /dev/null +++ b/examples/ToyPathTracer/README @@ -0,0 +1 @@ +https://github.com/aras-p/ToyPathTracer diff --git a/examples/ToyPathTracer/Source/Config.h b/examples/ToyPathTracer/Source/Config.h new file mode 100644 index 00000000..9feede1b --- /dev/null +++ b/examples/ToyPathTracer/Source/Config.h @@ -0,0 +1,33 @@ + +#if defined(__APPLE__) && !defined(__METAL_VERSION__) +#include +#endif + +#define kBackbufferWidth 1280 +#define kBackbufferHeight 720 + +#if defined(__EMSCRIPTEN__) +#define CPU_CAN_DO_SIMD 0 +#define CPU_CAN_DO_THREADS 0 +#else +#define CPU_CAN_DO_SIMD 1 +#define CPU_CAN_DO_THREADS 1 +#endif + + +#define DO_SAMPLES_PER_PIXEL 4 +#define DO_ANIMATE_SMOOTHING 0.9f +#define DO_LIGHT_SAMPLING 1 +#define DO_MITSUBA_COMPARE 0 + +// Should path tracing be done on the GPU with a compute shader? +#define DO_COMPUTE_GPU 0 +#define kCSGroupSizeX 8 +#define kCSGroupSizeY 8 +#define kCSMaxObjects 64 + +// Should float3 struct use SSE/NEON? +#define DO_FLOAT3_WITH_SIMD (!(DO_COMPUTE_GPU) && CPU_CAN_DO_SIMD && 1) + +// Should HitSpheres function use SSE/NEON? +#define DO_HIT_SPHERES_SIMD (CPU_CAN_DO_SIMD && 1) diff --git a/examples/ToyPathTracer/Source/MathSimd.h b/examples/ToyPathTracer/Source/MathSimd.h new file mode 100644 index 00000000..3e0a5cbe --- /dev/null +++ b/examples/ToyPathTracer/Source/MathSimd.h @@ -0,0 +1,192 @@ +#pragma once + +#if defined(_MSC_VER) +#define VM_INLINE __forceinline +#else +#define VM_INLINE __attribute__((unused, always_inline, nodebug)) inline +#endif + +#define kSimdWidth 4 + +#if !defined(__arm__) && !defined(__arm64__) && !defined(__EMSCRIPTEN__) + +// ---- SSE implementation + +#include +#include +#include + +#define SHUFFLE4(V, X,Y,Z,W) float4(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(W,Z,Y,X))) + +struct float4 +{ + VM_INLINE float4() {} + VM_INLINE explicit float4(const float *p) { m = _mm_loadu_ps(p); } + VM_INLINE explicit float4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); } + VM_INLINE explicit float4(float v) { m = _mm_set_ps1(v); } + VM_INLINE explicit float4(__m128 v) { m = v; } + + VM_INLINE float getX() const { return _mm_cvtss_f32(m); } + VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); } + VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); } + VM_INLINE float getW() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 3, 3, 3))); } + + __m128 m; +}; + +typedef float4 bool4; + +VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = _mm_add_ps(a.m, b.m); return a; } +VM_INLINE float4 operator- (float4 a, float4 b) { a.m = _mm_sub_ps(a.m, b.m); return a; } +VM_INLINE float4 operator* (float4 a, float4 b) { a.m = _mm_mul_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = _mm_and_ps(a.m, b.m); return a; } +VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = _mm_or_ps(a.m, b.m); return a; } +VM_INLINE float4 operator- (float4 a) { a.m = _mm_xor_ps(a.m, _mm_set1_ps(-0.0f)); return a; } +VM_INLINE float4 min(float4 a, float4 b) { a.m = _mm_min_ps(a.m, b.m); return a; } +VM_INLINE float4 max(float4 a, float4 b) { a.m = _mm_max_ps(a.m, b.m); return a; } + +VM_INLINE float hmin(float4 v) +{ + v = min(v, SHUFFLE4(v, 2, 3, 0, 0)); + v = min(v, SHUFFLE4(v, 1, 0, 0, 0)); + return v.getX(); +} + +// Returns a 4-bit code where bit0..bit3 is X..W +VM_INLINE unsigned mask(float4 v) { return _mm_movemask_ps(v.m); } +// Once we have a comparison, we can branch based on its results: +VM_INLINE bool any(bool4 v) { return mask(v) != 0; } +VM_INLINE bool all(bool4 v) { return mask(v) == 15; } + +// "select", i.e. hibit(cond) ? b : a +// on SSE4.1 and up this can be done easily via "blend" instruction; +// on older SSEs has to do a bunch of hoops, see +// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/ + +VM_INLINE float4 select(float4 a, float4 b, bool4 cond) +{ +#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1 + a.m = _mm_blendv_ps(a.m, b.m, cond.m); +#else + __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31)); + a.m = _mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m)); +#endif + return a; +} +VM_INLINE __m128i select(__m128i a, __m128i b, bool4 cond) +{ +#if defined(__SSE4_1__) || defined(_MSC_VER) // on windows assume we always have SSE4.1 + return _mm_blendv_epi8(a, b, _mm_castps_si128(cond.m)); +#else + __m128i d = _mm_srai_epi32(_mm_castps_si128(cond.m), 31); + return _mm_or_si128(_mm_and_si128(d, b), _mm_andnot_si128(d, a)); +#endif +} + +VM_INLINE float4 sqrtf(float4 v) { return float4(_mm_sqrt_ps(v.m)); } + +#elif !defined(__EMSCRIPTEN__) + +// ---- NEON implementation + +#define USE_NEON 1 +#include + +struct float4 +{ + VM_INLINE float4() {} + VM_INLINE explicit float4(const float *p) { m = vld1q_f32(p); } + VM_INLINE explicit float4(float x, float y, float z, float w) { float v[4] = {x, y, z, w}; m = vld1q_f32(v); } + VM_INLINE explicit float4(float v) { m = vdupq_n_f32(v); } + VM_INLINE explicit float4(float32x4_t v) { m = v; } + + VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); } + VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); } + VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); } + VM_INLINE float getW() const { return vgetq_lane_f32(m, 3); } + + float32x4_t m; +}; + +typedef float4 bool4; + +VM_INLINE float4 operator+ (float4 a, float4 b) { a.m = vaddq_f32(a.m, b.m); return a; } +VM_INLINE float4 operator- (float4 a, float4 b) { a.m = vsubq_f32(a.m, b.m); return a; } +VM_INLINE float4 operator* (float4 a, float4 b) { a.m = vmulq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator==(float4 a, float4 b) { a.m = vceqq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator!=(float4 a, float4 b) { a.m = a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; } +VM_INLINE bool4 operator< (float4 a, float4 b) { a.m = vcltq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator> (float4 a, float4 b) { a.m = vcgtq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator<=(float4 a, float4 b) { a.m = vcleq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator>=(float4 a, float4 b) { a.m = vcgeq_f32(a.m, b.m); return a; } +VM_INLINE bool4 operator&(bool4 a, bool4 b) { a.m = vandq_u32(a.m, b.m); return a; } +VM_INLINE bool4 operator|(bool4 a, bool4 b) { a.m = vorrq_u32(a.m, b.m); return a; } +VM_INLINE float4 operator- (float4 a) { a.m = vnegq_f32(a.m); return a; } +VM_INLINE float4 min(float4 a, float4 b) { a.m = vminq_f32(a.m, b.m); return a; } +VM_INLINE float4 max(float4 a, float4 b) { a.m = vmaxq_f32(a.m, b.m); return a; } + +VM_INLINE float hmin(float4 v) +{ + float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m)); + float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs); + return vget_lane_f32(minOfMinOfHalfs, 0); +} + +// Returns a 4-bit code where bit0..bit3 is X..W +VM_INLINE unsigned mask(float4 v) +{ + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = vreinterpretq_u32_f32(v.m); + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +} +// Once we have a comparison, we can branch based on its results: +VM_INLINE bool any(bool4 v) { return mask(v) != 0; } +VM_INLINE bool all(bool4 v) { return mask(v) == 15; } + +// "select", i.e. hibit(cond) ? b : a +// on SSE4.1 and up this can be done easily via "blend" instruction; +// on older SSEs has to do a bunch of hoops, see +// https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/ + +VM_INLINE float4 select(float4 a, float4 b, bool4 cond) +{ + a.m = vbslq_f32(cond.m, b.m, a.m); + return a; +} +VM_INLINE int32x4_t select(int32x4_t a, int32x4_t b, bool4 cond) +{ + return vbslq_f32(cond.m, b, a); +} + +VM_INLINE float4 sqrtf(float4 v) +{ + float32x4_t V = v.m; + float32x4_t S0 = vrsqrteq_f32(V); + float32x4_t P0 = vmulq_f32( V, S0 ); + float32x4_t R0 = vrsqrtsq_f32( P0, S0 ); + float32x4_t S1 = vmulq_f32( S0, R0 ); + float32x4_t P1 = vmulq_f32( V, S1 ); + float32x4_t R1 = vrsqrtsq_f32( P1, S1 ); + float32x4_t S2 = vmulq_f32( S1, R1 ); + float32x4_t P2 = vmulq_f32( V, S2 ); + float32x4_t R2 = vrsqrtsq_f32( P2, S2 ); + float32x4_t S3 = vmulq_f32( S2, R2 ); + return float4(vmulq_f32(V, S3)); +} + +VM_INLINE float4 splatX(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 0)); } +VM_INLINE float4 splatY(float32x4_t v) { return float4(vdupq_lane_f32(vget_low_f32(v), 1)); } +VM_INLINE float4 splatZ(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 0)); } +VM_INLINE float4 splatW(float32x4_t v) { return float4(vdupq_lane_f32(vget_high_f32(v), 1)); } + +#endif diff --git a/examples/ToyPathTracer/Source/Maths.cpp b/examples/ToyPathTracer/Source/Maths.cpp new file mode 100644 index 00000000..ca875139 --- /dev/null +++ b/examples/ToyPathTracer/Source/Maths.cpp @@ -0,0 +1,203 @@ +#include "Maths.h" +#include +#include + +static uint32_t XorShift32(uint32_t& state) +{ + uint32_t x = state; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 15; + state = x; + return x; +} + +float RandomFloat01(uint32_t& state) +{ + return (XorShift32(state) & 0xFFFFFF) / 16777216.0f; +} + +float3 RandomInUnitDisk(uint32_t& state) +{ + float3 p; + do + { + p = 2.0 * float3(RandomFloat01(state),RandomFloat01(state),0) - float3(1,1,0); + } while (dot(p,p) >= 1.0); + return p; +} + +float3 RandomInUnitSphere(uint32_t& state) +{ + float3 p; + do { + p = 2.0*float3(RandomFloat01(state),RandomFloat01(state),RandomFloat01(state)) - float3(1,1,1); + } while (sqLength(p) >= 1.0); + return p; +} + +float3 RandomUnitVector(uint32_t& state) +{ + float z = RandomFloat01(state) * 2.0f - 1.0f; + float a = RandomFloat01(state) * 2.0f * kPI; + float r = sqrtf(1.0f - z * z); + float x = r * cosf(a); + float y = r * sinf(a); + return float3(x, y, z); +} + + +int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit) +{ +#if DO_HIT_SPHERES_SIMD + float4 hitT = float4(tMax); +#if USE_NEON + int32x4_t id = vdupq_n_s32(-1); +#else + __m128i id = _mm_set1_epi32(-1); +#endif + +#if DO_FLOAT3_WITH_SIMD && !USE_NEON + float4 rOrigX = SHUFFLE4(r.orig, 0, 0, 0, 0); + float4 rOrigY = SHUFFLE4(r.orig, 1, 1, 1, 1); + float4 rOrigZ = SHUFFLE4(r.orig, 2, 2, 2, 2); + float4 rDirX = SHUFFLE4(r.dir, 0, 0, 0, 0); + float4 rDirY = SHUFFLE4(r.dir, 1, 1, 1, 1); + float4 rDirZ = SHUFFLE4(r.dir, 2, 2, 2, 2); +#elif DO_FLOAT3_WITH_SIMD + float4 rOrigX = splatX(r.orig.m); + float4 rOrigY = splatY(r.orig.m); + float4 rOrigZ = splatZ(r.orig.m); + float4 rDirX = splatX(r.dir.m); + float4 rDirY = splatY(r.dir.m); + float4 rDirZ = splatZ(r.dir.m); +#else + float4 rOrigX = float4(r.orig.x); + float4 rOrigY = float4(r.orig.y); + float4 rOrigZ = float4(r.orig.z); + float4 rDirX = float4(r.dir.x); + float4 rDirY = float4(r.dir.y); + float4 rDirZ = float4(r.dir.z); +#endif + float4 tMin4 = float4(tMin); +#if USE_NEON + int32x4_t curId = vcombine_u32(vcreate_u32(0ULL | (1ULL<<32)), vcreate_u32(2ULL | (3ULL<<32))); +#else + __m128i curId = _mm_set_epi32(3, 2, 1, 0); +#endif + // process 4 spheres at once + for (int i = 0; i < spheres.simdCount; i += kSimdWidth) + { + // load data for 4 spheres + float4 sCenterX = float4(spheres.centerX + i); + float4 sCenterY = float4(spheres.centerY + i); + float4 sCenterZ = float4(spheres.centerZ + i); + float4 sSqRadius = float4(spheres.sqRadius + i); + // note: we flip this vector and calculate -b (nb) since that happens to be slightly preferable computationally + float4 coX = sCenterX - rOrigX; + float4 coY = sCenterY - rOrigY; + float4 coZ = sCenterZ - rOrigZ; + float4 nb = coX * rDirX + coY * rDirY + coZ * rDirZ; + float4 c = coX * coX + coY * coY + coZ * coZ - sSqRadius; + float4 discr = nb * nb - c; + bool4 discrPos = discr > float4(0.0f); + // if ray hits any of the 4 spheres + if (any(discrPos)) + { + float4 discrSq = sqrtf(discr); + + // ray could hit spheres at t0 & t1 + float4 t0 = nb - discrSq; + float4 t1 = nb + discrSq; + + float4 t = select(t1, t0, t0 > tMin4); // if t0 is above min, take it (since it's the earlier hit); else try t1. + bool4 msk = discrPos & (t > tMin4) & (t < hitT); + // if hit, take it + id = select(id, curId, msk); + hitT = select(hitT, t, msk); + } +#if USE_NEON + curId = vaddq_s32(curId, vdupq_n_s32(kSimdWidth)); +#else + curId = _mm_add_epi32(curId, _mm_set1_epi32(kSimdWidth)); +#endif + } + // now we have up to 4 hits, find and return closest one + float minT = hmin(hitT); + if (minT < tMax) // any actual hits? + { + int minMask = mask(hitT == float4(minT)); + if (minMask != 0) + { + int id_scalar[4]; + float hitT_scalar[4]; +#if USE_NEON + vst1q_s32(id_scalar, id); + vst1q_f32(hitT_scalar, hitT.m); +#else + _mm_storeu_si128((__m128i *)id_scalar, id); + _mm_storeu_ps(hitT_scalar, hitT.m); +#endif + + // In general, you would do this with a bit scan (first set/trailing zero count). + // But who cares, it's only 16 options. + static const int laneId[16] = + { + 0, 0, 1, 0, // 00xx + 2, 0, 1, 0, // 01xx + 3, 0, 1, 0, // 10xx + 2, 0, 1, 0, // 11xx + }; + + int lane = laneId[minMask]; + int hitId = id_scalar[lane]; + float finalHitT = hitT_scalar[lane]; + + outHit.pos = r.pointAt(finalHitT); + outHit.normal = (outHit.pos - float3(spheres.centerX[hitId], spheres.centerY[hitId], spheres.centerZ[hitId])) * spheres.invRadius[hitId]; + outHit.t = finalHitT; + return hitId; + } + } + + return -1; + +#else // #if DO_HIT_SPHERES_SIMD + + float hitT = tMax; + int id = -1; + for (int i = 0; i < spheres.count; ++i) + { + float coX = spheres.centerX[i] - r.orig.getX(); + float coY = spheres.centerY[i] - r.orig.getY(); + float coZ = spheres.centerZ[i] - r.orig.getZ(); + float nb = coX * r.dir.getX() + coY * r.dir.getY() + coZ * r.dir.getZ(); + float c = coX * coX + coY * coY + coZ * coZ - spheres.sqRadius[i]; + float discr = nb * nb - c; + if (discr > 0) + { + float discrSq = sqrtf(discr); + + // Try earlier t + float t = nb - discrSq; + if (t <= tMin) // before min, try later t! + t = nb + discrSq; + + if (t > tMin && t < hitT) + { + id = i; + hitT = t; + } + } + } + if (id != -1) + { + outHit.pos = r.pointAt(hitT); + outHit.normal = (outHit.pos - float3(spheres.centerX[id], spheres.centerY[id], spheres.centerZ[id])) * spheres.invRadius[id]; + outHit.t = hitT; + return id; + } + else + return -1; +#endif // #else of #if DO_HIT_SPHERES_SIMD +} diff --git a/examples/ToyPathTracer/Source/Maths.h b/examples/ToyPathTracer/Source/Maths.h new file mode 100644 index 00000000..b587715f --- /dev/null +++ b/examples/ToyPathTracer/Source/Maths.h @@ -0,0 +1,436 @@ +#pragma once + +#include +#include +#include +#include "Config.h" +#include "MathSimd.h" + +#define kPI 3.1415926f + +// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/ +#if DO_FLOAT3_WITH_SIMD + + +#if !defined(__arm__) && !defined(__arm64__) + +// ---- SSE implementation + +// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz). +// SHUFFLE3(v, 0,0,0) splats the X (v.xxx). +#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X))) + +struct float3 +{ + VM_INLINE float3() {} + VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); } + VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); } + VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); } + VM_INLINE explicit float3(__m128 v) { m = v; } + + VM_INLINE float getX() const { return _mm_cvtss_f32(m); } + VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); } + VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); } + + VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); } + VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); } + + VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); } + + void setX(float x) + { + m = _mm_move_ss(m, _mm_set_ss(x)); + } + void setY(float y) + { + __m128 t = _mm_move_ss(m, _mm_set_ss(y)); + t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0)); + m = _mm_move_ss(t, m); + } + void setZ(float z) + { + __m128 t = _mm_move_ss(m, _mm_set_ss(z)); + t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0)); + m = _mm_move_ss(t, m); + } + + __m128 m; +}; + +typedef float3 bool3; + +VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; } +VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; } +VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; } +VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; } +VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; } +VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; } +VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; } +VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; } +VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; } +VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; } +VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; } +VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; } +VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; } +VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; } +VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; } +VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; } +VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; } +VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; } +VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; } +VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; } +VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; } +VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; } + +VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; } + +VM_INLINE float hmin(float3 v) +{ + v = min(v, SHUFFLE3(v, 1, 0, 2)); + return min(v, SHUFFLE3(v, 2, 0, 1)).getX(); +} +VM_INLINE float hmax(float3 v) +{ + v = max(v, SHUFFLE3(v, 1, 0, 2)); + return max(v, SHUFFLE3(v, 2, 0, 1)).getX(); +} + +VM_INLINE float3 cross(float3 a, float3 b) +{ + // x <- a.y*b.z - a.z*b.y + // y <- a.z*b.x - a.x*b.z + // z <- a.x*b.y - a.y*b.x + // We can save a shuffle by grouping it in this wacky order: + return (a.zxy()*b - a*b.zxy()).zxy(); +} + +// Returns a 3-bit code where bit0..bit2 is X..Z +VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; } +// Once we have a comparison, we can branch based on its results: +VM_INLINE bool any(bool3 v) { return mask(v) != 0; } +VM_INLINE bool all(bool3 v) { return mask(v) == 7; } + +VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); } +VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); } +VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); } + +#else // #if !defined(__arm__) && !defined(__arm64__) + +// ---- NEON implementation + +#include + +struct float3 +{ + VM_INLINE float3() {} + VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); } + VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); } + VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); } + VM_INLINE explicit float3(float32x4_t v) { m = v; } + + VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); } + VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); } + VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); } + + VM_INLINE float3 yzx() const + { + float32x2_t low = vget_low_f32(m); + float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low); + return float3(yzx); + } + VM_INLINE float3 zxy() const + { + float32x4_t p = m; + p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1]; + return float3(p); + } + + VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); } + + void setX(float x) + { + m = vsetq_lane_f32(x, m, 0); + } + void setY(float y) + { + m = vsetq_lane_f32(y, m, 1); + } + void setZ(float z) + { + m = vsetq_lane_f32(z, m, 2); + } + + float32x4_t m; +}; + +typedef float3 bool3; + +VM_INLINE float32x4_t rcp_2(float32x4_t v) +{ + float32x4_t e = vrecpeq_f32(v); + e = vmulq_f32(vrecpsq_f32(e, v), e); + e = vmulq_f32(vrecpsq_f32(e, v), e); + return e; +} + +VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; } +VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; } +VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; } +VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; } +VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; } +VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; } +VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; } +VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; } +VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; } +VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; } +VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; } +VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; } +VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; } +VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; } +VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; } +VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; } +VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; } +VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; } +VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; } +VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; } +VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; } +VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; } + +VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; } + +VM_INLINE float hmin(float3 v) +{ + float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m)); + float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs); + return vget_lane_f32(minOfMinOfHalfs, 0); +} +VM_INLINE float hmax(float3 v) +{ + float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m)); + float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs); + return vget_lane_f32(maxOfMaxOfHalfs, 0); +} + +VM_INLINE float3 cross(float3 a, float3 b) +{ + // x <- a.y*b.z - a.z*b.y + // y <- a.z*b.x - a.x*b.z + // z <- a.x*b.y - a.y*b.x + // We can save a shuffle by grouping it in this wacky order: + return (a.zxy()*b - a*b.zxy()).zxy(); +} + +// Returns a 3-bit code where bit0..bit2 is X..Z +VM_INLINE unsigned mask(float3 v) +{ + static const uint32x4_t movemask = { 1, 2, 4, 8 }; + static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; + uint32x4_t t0 = vreinterpretq_u32_f32(v.m); + uint32x4_t t1 = vtstq_u32(t0, highbit); + uint32x4_t t2 = vandq_u32(t1, movemask); + uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2)); + return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1); +} +// Once we have a comparison, we can branch based on its results: +VM_INLINE bool any(bool3 v) { return mask(v) != 0; } +VM_INLINE bool all(bool3 v) { return mask(v) == 7; } + +VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); } +VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); } +VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); } + + +#endif // #else of #if !defined(__arm__) && !defined(__arm64__) + +#else // #if DO_FLOAT3_WITH_SIMD + +// ---- Simple scalar C implementation + + +struct float3 +{ + float3() : x(0), y(0), z(0) {} + float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {} + + float3 operator-() const { return float3(-x, -y, -z); } + float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; } + float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; } + float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; } + float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; } + + VM_INLINE float getX() const { return x; } + VM_INLINE float getY() const { return y; } + VM_INLINE float getZ() const { return z; } + VM_INLINE void setX(float x_) { x = x_; } + VM_INLINE void setY(float y_) { y = y_; } + VM_INLINE void setZ(float z_) { z = z_; } + VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); } + + float x, y, z; +}; + +VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); } +VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); } +VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); } +VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); } +VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); } +VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; } +VM_INLINE float3 cross(const float3& a, const float3& b) +{ + return float3( + a.y*b.z - a.z*b.y, + -(a.x*b.z - a.z*b.x), + a.x*b.y - a.y*b.x + ); +} +#endif // #else of #if DO_FLOAT3_WITH_SIMD + +VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); } +VM_INLINE float sqLength(float3 v) { return dot(v, v); } +VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); } +VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; } + + +inline void AssertUnit(float3 v) +{ + assert(fabsf(sqLength(v) - 1.0f) < 0.01f); +} + +inline float3 reflect(float3 v, float3 n) +{ + return v - 2*dot(v,n)*n; +} + +inline bool refract(float3 v, float3 n, float nint, float3& outRefracted) +{ + AssertUnit(v); + float dt = dot(v, n); + float discr = 1.0f - nint*nint*(1-dt*dt); + if (discr > 0) + { + outRefracted = nint * (v - n*dt) - n*sqrtf(discr); + return true; + } + return false; +} +inline float schlick(float cosine, float ri) +{ + float r0 = (1-ri) / (1+ri); + r0 = r0*r0; + return r0 + (1-r0)*powf(1-cosine, 5); +} + +struct Ray +{ + Ray() {} + Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); } + + float3 pointAt(float t) const { return orig + dir * t; } + + float3 orig; + float3 dir; +}; + + +struct Hit +{ + float3 pos; + float3 normal; + float t; +}; + + +struct Sphere +{ + Sphere() : radius(1.0f), invRadius(0.0f) {} + Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {} + + void UpdateDerivedData() { invRadius = 1.0f/radius; } + + float3 center; + float radius; + float invRadius; +}; + + +// data for all spheres in a "structure of arrays" layout +struct SpheresSoA +{ + SpheresSoA(int c) + { + count = c; + // we'll be processing spheres in kSimdWidth chunks, so make sure to allocate + // enough space + simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth; + centerX = new float[simdCount]; + centerY = new float[simdCount]; + centerZ = new float[simdCount]; + sqRadius = new float[simdCount]; + invRadius = new float[simdCount]; + // set all data to "impossible sphere" state + for (int i = count; i < simdCount; ++i) + { + centerX[i] = centerY[i] = centerZ[i] = 10000.0f; + sqRadius[i] = 0.0f; + invRadius[i] = 0.0f; + } + } + ~SpheresSoA() + { + delete[] centerX; + delete[] centerY; + delete[] centerZ; + delete[] sqRadius; + delete[] invRadius; + } + float* centerX; + float* centerY; + float* centerZ; + float* sqRadius; + float* invRadius; + int simdCount; + int count; +}; + + +int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit); + +float RandomFloat01(uint32_t& state); +float3 RandomInUnitDisk(uint32_t& state); +float3 RandomInUnitSphere(uint32_t& state); +float3 RandomUnitVector(uint32_t& state); + +struct Camera +{ + Camera() {} + // vfov is top to bottom in degrees + Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist) + { + lensRadius = aperture / 2; + float theta = vfov*kPI/180; + float halfHeight = tanf(theta/2); + float halfWidth = aspect * halfHeight; + origin = lookFrom; + w = normalize(lookFrom - lookAt); + u = normalize(cross(vup, w)); + v = cross(w, u); + lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w; + horizontal = 2*halfWidth*focusDist*u; + vertical = 2*halfHeight*focusDist*v; + } + + Ray GetRay(float s, float t, uint32_t& state) const + { + float3 rd = lensRadius * RandomInUnitDisk(state); + float3 offset = u * rd.getX() + v * rd.getY(); + return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset)); + } + + float3 origin; + float3 lowerLeftCorner; + float3 horizontal; + float3 vertical; + float3 u, v, w; + float lensRadius; +}; + diff --git a/examples/ToyPathTracer/Source/Test.cpp b/examples/ToyPathTracer/Source/Test.cpp new file mode 100644 index 00000000..8911281a --- /dev/null +++ b/examples/ToyPathTracer/Source/Test.cpp @@ -0,0 +1,380 @@ +#include "Config.h" +#include "Test.h" +#include "Maths.h" +#include +#if CPU_CAN_DO_THREADS +#include "enkiTS/TaskScheduler_c.h" +#endif +#include + +// 46 spheres (2 emissive) when enabled; 9 spheres (1 emissive) when disabled +#define DO_BIG_SCENE 1 + +static Sphere s_Spheres[] = +{ + {float3(0,-100.5,-1), 100}, + {float3(2,0,-1), 0.5f}, + {float3(0,0,-1), 0.5f}, + {float3(-2,0,-1), 0.5f}, + {float3(2,0,1), 0.5f}, + {float3(0,0,1), 0.5f}, + {float3(-2,0,1), 0.5f}, + {float3(0.5f,1,0.5f), 0.5f}, + {float3(-1.5f,1.5f,0.f), 0.3f}, +#if DO_BIG_SCENE + {float3(4,0,-3), 0.5f}, {float3(3,0,-3), 0.5f}, {float3(2,0,-3), 0.5f}, {float3(1,0,-3), 0.5f}, {float3(0,0,-3), 0.5f}, {float3(-1,0,-3), 0.5f}, {float3(-2,0,-3), 0.5f}, {float3(-3,0,-3), 0.5f}, {float3(-4,0,-3), 0.5f}, + {float3(4,0,-4), 0.5f}, {float3(3,0,-4), 0.5f}, {float3(2,0,-4), 0.5f}, {float3(1,0,-4), 0.5f}, {float3(0,0,-4), 0.5f}, {float3(-1,0,-4), 0.5f}, {float3(-2,0,-4), 0.5f}, {float3(-3,0,-4), 0.5f}, {float3(-4,0,-4), 0.5f}, + {float3(4,0,-5), 0.5f}, {float3(3,0,-5), 0.5f}, {float3(2,0,-5), 0.5f}, {float3(1,0,-5), 0.5f}, {float3(0,0,-5), 0.5f}, {float3(-1,0,-5), 0.5f}, {float3(-2,0,-5), 0.5f}, {float3(-3,0,-5), 0.5f}, {float3(-4,0,-5), 0.5f}, + {float3(4,0,-6), 0.5f}, {float3(3,0,-6), 0.5f}, {float3(2,0,-6), 0.5f}, {float3(1,0,-6), 0.5f}, {float3(0,0,-6), 0.5f}, {float3(-1,0,-6), 0.5f}, {float3(-2,0,-6), 0.5f}, {float3(-3,0,-6), 0.5f}, {float3(-4,0,-6), 0.5f}, + {float3(1.5f,1.5f,-2), 0.3f}, +#endif // #if DO_BIG_SCENE +}; +const int kSphereCount = sizeof(s_Spheres) / sizeof(s_Spheres[0]); + +static SpheresSoA s_SpheresSoA(kSphereCount); + +struct Material +{ + enum Type { Lambert, Metal, Dielectric }; + Type type; + float3 albedo; + float3 emissive; + float roughness; + float ri; +}; + +static Material s_SphereMats[kSphereCount] = +{ + { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, + { Material::Lambert, float3(0.8f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, + { Material::Lambert, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0, }, + { Material::Metal, float3(0.4f, 0.4f, 0.8f), float3(0,0,0), 0, 0 }, + { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0, 0 }, + { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.2f, 0 }, + { Material::Metal, float3(0.4f, 0.8f, 0.4f), float3(0,0,0), 0.6f, 0 }, + { Material::Dielectric, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 1.5f }, + { Material::Lambert, float3(0.8f, 0.6f, 0.2f), float3(30,25,15), 0, 0 }, +#if DO_BIG_SCENE + { Material::Lambert, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, }, + { Material::Metal, float3(0.1f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.2f, 0.2f, 0.2f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.3f, 0.3f, 0.3f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.4f, 0.4f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.5f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.6f, 0.6f, 0.6f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.7f, 0.7f, 0.7f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.9f, 0.9f, 0.9f), float3(0,0,0), 0, 0, }, + { Material::Metal, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, + { Material::Lambert, float3(0.8f, 0.1f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.5f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.8f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.4f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.1f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.5f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.8f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Lambert, float3(0.1f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, { Material::Metal, float3(0.5f, 0.1f, 0.8f), float3(0,0,0), 0, 0, }, + { Material::Lambert, float3(0.1f, 0.2f, 0.5f), float3(3,10,20), 0, 0 }, +#endif +}; + +static int s_EmissiveSpheres[kSphereCount]; +static int s_EmissiveSphereCount; + +static Camera s_Cam; + +const float kMinT = 0.001f; +const float kMaxT = 1.0e7f; +const int kMaxDepth = 10; + + +bool HitWorld(const Ray& r, float tMin, float tMax, Hit& outHit, int& outID) +{ + outID = HitSpheres(r, s_SpheresSoA, tMin, tMax, outHit); + return outID != -1; +} + + +static bool Scatter(const Material& mat, const Ray& r_in, const Hit& rec, float3& attenuation, Ray& scattered, float3& outLightE, int& inoutRayCount, uint32_t& state) +{ + outLightE = float3(0,0,0); + if (mat.type == Material::Lambert) + { + // random point on unit sphere that is tangent to the hit point + float3 target = rec.pos + rec.normal + RandomUnitVector(state); + scattered = Ray(rec.pos, normalize(target - rec.pos)); + attenuation = mat.albedo; + + // sample lights +#if DO_LIGHT_SAMPLING + for (int j = 0; j < s_EmissiveSphereCount; ++j) + { + int i = s_EmissiveSpheres[j]; + const Material& smat = s_SphereMats[i]; + if (&mat == &smat) + continue; // skip self + const Sphere& s = s_Spheres[i]; + + // create a random direction towards sphere + // coord system for sampling: sw, su, sv + float3 sw = normalize(s.center - rec.pos); + float3 su = normalize(cross(fabs(sw.getX())>0.01f ? float3(0,1,0):float3(1,0,0), sw)); + float3 sv = cross(sw, su); + // sample sphere by solid angle + float cosAMax = sqrtf(1.0f - s.radius*s.radius / sqLength(rec.pos-s.center)); + float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state); + float cosA = 1.0f - eps1 + eps1 * cosAMax; + float sinA = sqrtf(1.0f - cosA*cosA); + float phi = 2 * kPI * eps2; + float3 l = su * (cosf(phi) * sinA) + sv * (sinf(phi) * sinA) + sw * cosA; + //l = normalize(l); // NOTE(fg): This is already normalized, by construction. + + // shoot shadow ray + Hit lightHit; + int hitID; + ++inoutRayCount; + if (HitWorld(Ray(rec.pos, l), kMinT, kMaxT, lightHit, hitID) && hitID == i) + { + float omega = 2 * kPI * (1-cosAMax); + + float3 rdir = r_in.dir; + AssertUnit(rdir); + float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal; + outLightE += (mat.albedo * smat.emissive) * (std::max(0.0f, dot(l, nl)) * omega / kPI); + } + } +#endif + return true; + } + else if (mat.type == Material::Metal) + { + AssertUnit(r_in.dir); AssertUnit(rec.normal); + float3 refl = reflect(r_in.dir, rec.normal); + // reflected ray, and random inside of sphere based on roughness + float roughness = mat.roughness; +#if DO_MITSUBA_COMPARE + roughness = 0; // until we get better BRDF for metals +#endif + scattered = Ray(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state))); + attenuation = mat.albedo; + return dot(scattered.dir, rec.normal) > 0; + } + else if (mat.type == Material::Dielectric) + { + AssertUnit(r_in.dir); AssertUnit(rec.normal); + float3 outwardN; + float3 rdir = r_in.dir; + float3 refl = reflect(rdir, rec.normal); + float nint; + attenuation = float3(1,1,1); + float3 refr; + float reflProb; + float cosine; + if (dot(rdir, rec.normal) > 0) + { + outwardN = -rec.normal; + nint = mat.ri; + cosine = mat.ri * dot(rdir, rec.normal); + } + else + { + outwardN = rec.normal; + nint = 1.0f / mat.ri; + cosine = -dot(rdir, rec.normal); + } + if (refract(rdir, outwardN, nint, refr)) + { + reflProb = schlick(cosine, mat.ri); + } + else + { + reflProb = 1; + } + if (RandomFloat01(state) < reflProb) + scattered = Ray(rec.pos, normalize(refl)); + else + scattered = Ray(rec.pos, normalize(refr)); + } + else + { + attenuation = float3(1,0,1); + return false; + } + return true; +} + +static float3 Trace(const Ray& r, int depth, int& inoutRayCount, uint32_t& state, bool doMaterialE = true) +{ + Hit rec; + int id = 0; + ++inoutRayCount; + if (HitWorld(r, kMinT, kMaxT, rec, id)) + { + Ray scattered; + float3 attenuation; + float3 lightE; + const Material& mat = s_SphereMats[id]; + float3 matE = mat.emissive; + if (depth < kMaxDepth && Scatter(mat, r, rec, attenuation, scattered, lightE, inoutRayCount, state)) + { +#if DO_LIGHT_SAMPLING + if (!doMaterialE) matE = float3(0,0,0); // don't add material emission if told so + // dor Lambert materials, we just did explicit light (emissive) sampling and already + // for their contribution, so if next ray bounce hits the light again, don't add + // emission + doMaterialE = (mat.type != Material::Lambert); +#endif + return matE + lightE + attenuation * Trace(scattered, depth+1, inoutRayCount, state, doMaterialE); + } + else + { + return matE; + } + } + else + { + // sky +#if DO_MITSUBA_COMPARE + return float3(0.15f,0.21f,0.3f); // easier compare with Mitsuba's constant environment light +#else + float3 unitDir = r.dir; + float t = 0.5f*(unitDir.getY() + 1.0f); + return ((1.0f-t)*float3(1.0f, 1.0f, 1.0f) + t*float3(0.5f, 0.7f, 1.0f)) * 0.3f; +#endif + } +} + +#if CPU_CAN_DO_THREADS +static enkiTaskScheduler* g_TS; +#endif + +void InitializeTest() +{ + #if CPU_CAN_DO_THREADS + g_TS = enkiNewTaskScheduler(); + enkiInitTaskScheduler(g_TS); + #endif +} + +void ShutdownTest() +{ + #if CPU_CAN_DO_THREADS + enkiDeleteTaskScheduler(g_TS); + #endif +} + +struct JobData +{ + float time; + int frameCount; + int screenWidth, screenHeight; + float* backbuffer; + Camera* cam; + std::atomic rayCount; + unsigned testFlags; +}; + +static void TraceRowJob(uint32_t start, uint32_t end, uint32_t threadnum, void* data_) +{ + JobData& data = *(JobData*)data_; + float* backbuffer = data.backbuffer + start * data.screenWidth * 4; + float invWidth = 1.0f / data.screenWidth; + float invHeight = 1.0f / data.screenHeight; + float lerpFac = float(data.frameCount) / float(data.frameCount+1); + if (data.testFlags & kFlagAnimate) + lerpFac *= DO_ANIMATE_SMOOTHING; + if (!(data.testFlags & kFlagProgressive)) + lerpFac = 0; + int rayCount = 0; + for (uint32_t y = start; y < end; ++y) + { + uint32_t state = (y * 9781 + data.frameCount * 6271) | 1; + for (int x = 0; x < data.screenWidth; ++x) + { + float3 col(0, 0, 0); + for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++) + { + float u = float(x + RandomFloat01(state)) * invWidth; + float v = float(y + RandomFloat01(state)) * invHeight; + Ray r = data.cam->GetRay(u, v, state); + col += Trace(r, 0, rayCount, state); + } + col *= 1.0f / float(DO_SAMPLES_PER_PIXEL); + + float3 prev(backbuffer[0], backbuffer[1], backbuffer[2]); + col = prev * lerpFac + col * (1-lerpFac); + col.store(backbuffer); + backbuffer += 4; + } + } + data.rayCount += rayCount; +} + +void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags) +{ + if (testFlags & kFlagAnimate) + { + s_Spheres[1].center.setY(cosf(time) + 1.0f); + s_Spheres[8].center.setZ(sinf(time)*0.3f); + } + float3 lookfrom(0, 2, 3); + float3 lookat(0, 0, 0); + float distToFocus = 3; +#if DO_MITSUBA_COMPARE + float aperture = 0.0f; +#else + float aperture = 0.1f; +#endif +#if DO_BIG_SCENE + aperture *= 0.2f; +#endif + + s_EmissiveSphereCount = 0; + for (int i = 0; i < kSphereCount; ++i) + { + Sphere& s = s_Spheres[i]; + s.UpdateDerivedData(); + s_SpheresSoA.centerX[i] = s.center.getX(); + s_SpheresSoA.centerY[i] = s.center.getY(); + s_SpheresSoA.centerZ[i] = s.center.getZ(); + s_SpheresSoA.sqRadius[i] = s.radius * s.radius; + s_SpheresSoA.invRadius[i] = s.invRadius; + + // Remember IDs of emissive spheres (light sources) + const Material& smat = s_SphereMats[i]; + if (smat.emissive.getX() > 0 || smat.emissive.getY() > 0 || smat.emissive.getZ() > 0) + { + s_EmissiveSpheres[s_EmissiveSphereCount] = i; + s_EmissiveSphereCount++; + } + } + + s_Cam = Camera(lookfrom, lookat, float3(0, 1, 0), 60, float(screenWidth) / float(screenHeight), aperture, distToFocus); +} + +void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags) +{ + JobData args; + args.time = time; + args.frameCount = frameCount; + args.screenWidth = screenWidth; + args.screenHeight = screenHeight; + args.backbuffer = backbuffer; + args.cam = &s_Cam; + args.testFlags = testFlags; + args.rayCount = 0; + + #if CPU_CAN_DO_THREADS + enkiTaskSet* task = enkiCreateTaskSet(g_TS, TraceRowJob); + bool threaded = true; + enkiAddTaskSetToPipeMinRange(g_TS, task, &args, screenHeight, threaded ? 4 : screenHeight); + enkiWaitForTaskSet(g_TS, task); + enkiDeleteTaskSet(task); + #else + TraceRowJob(0, screenHeight, 0, &args); + #endif + + outRayCount = args.rayCount; +} + +void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize) +{ + outCount = kSphereCount; + outObjectSize = sizeof(Sphere); + outMaterialSize = sizeof(Material); + outCamSize = sizeof(Camera); +} + +void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount) +{ + memcpy(outObjects, s_Spheres, kSphereCount * sizeof(s_Spheres[0])); + memcpy(outMaterials, s_SphereMats, kSphereCount * sizeof(s_SphereMats[0])); + memcpy(outCam, &s_Cam, sizeof(s_Cam)); + memcpy(outEmissives, s_EmissiveSpheres, s_EmissiveSphereCount * sizeof(s_EmissiveSpheres[0])); + *outEmissiveCount = s_EmissiveSphereCount; +} diff --git a/examples/ToyPathTracer/Source/Test.h b/examples/ToyPathTracer/Source/Test.h new file mode 100644 index 00000000..c085b5d5 --- /dev/null +++ b/examples/ToyPathTracer/Source/Test.h @@ -0,0 +1,17 @@ +#pragma once +#include + +enum TestFlags +{ + kFlagAnimate = (1 << 0), + kFlagProgressive = (1 << 1), +}; + +void InitializeTest(); +void ShutdownTest(); + +void UpdateTest(float time, int frameCount, int screenWidth, int screenHeight, unsigned testFlags); +void DrawTest(float time, int frameCount, int screenWidth, int screenHeight, float* backbuffer, int& outRayCount, unsigned testFlags); + +void GetObjectCount(int& outCount, int& outObjectSize, int& outMaterialSize, int& outCamSize); +void GetSceneDesc(void* outObjects, void* outMaterials, void* outCam, void* outEmissives, int* outEmissiveCount); diff --git a/examples/ToyPathTracer/Source/enkiTS/Atomics.h b/examples/ToyPathTracer/Source/enkiTS/Atomics.h new file mode 100644 index 00000000..878572ba --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/Atomics.h @@ -0,0 +1,79 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#pragma once + +#include + +#ifdef _WIN32 + #define WIN32_LEAN_AND_MEAN + #include + #undef GetObject + #include + + extern "C" void _ReadWriteBarrier(); + #pragma intrinsic(_ReadWriteBarrier) + #pragma intrinsic(_InterlockedCompareExchange) + #pragma intrinsic(_InterlockedExchangeAdd) + + // Memory Barriers to prevent CPU and Compiler re-ordering + #define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier() + #define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier() + #define BASE_ALIGN(x) __declspec( align( x ) ) + +#else + #define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory") + #define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory") + #define BASE_ALIGN(x) __attribute__ ((aligned( x ))) +#endif + +namespace enki +{ + // Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; } + // returns old *pDest (so if successfull, returns compareWith) + inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith ) + { + #ifdef _WIN32 + // assumes two's complement - unsigned / signed conversion leads to same bit pattern + return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith ); + #else + return __sync_val_compare_and_swap( pDest, compareWith, swapTo ); + #endif + } + + inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith ) + { + #ifdef _WIN32 + // assumes two's complement - unsigned / signed conversion leads to same bit pattern + return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith ); + #else + return __sync_val_compare_and_swap( pDest, compareWith, swapTo ); + #endif + } + + // Atomically performs: tmp = *pDest; *pDest += value; return tmp; + inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value ) + { + #ifdef _WIN32 + return _InterlockedExchangeAdd( (long*)pDest, value ); + #else + return __sync_fetch_and_add( pDest, value ); + #endif + } + +} \ No newline at end of file diff --git a/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h b/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h new file mode 100644 index 00000000..7439d09c --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/LockLessMultiReadPipe.h @@ -0,0 +1,240 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#pragma once + +#include +#include + +#include "Atomics.h" +#include + + +namespace enki +{ + // LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming + // Readers can only read from the back of the pipe + // The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader) + // for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx + // Note: using log2 sizes so we do not need to clamp (multi-operation) + // T is the contained type + // Note this is not true lockless as the use of flags as a form of lock state. + template class LockLessMultiReadPipe + { + public: + LockLessMultiReadPipe(); + ~LockLessMultiReadPipe() {} + + // ReaderTryReadBack returns false if we were unable to read + // This is thread safe for both multiple readers and the writer + bool ReaderTryReadBack( T* pOut ); + + // WriterTryReadFront returns false if we were unable to read + // This is thread safe for the single writer, but should not be called by readers + bool WriterTryReadFront( T* pOut ); + + // WriterTryWriteFront returns false if we were unable to write + // This is thread safe for the single writer, but should not be called by readers + bool WriterTryWriteFront( const T& in ); + + // IsPipeEmpty() is a utility function, not intended for general use + // Should only be used very prudently. + bool IsPipeEmpty() const + { + return 0 == m_WriteIndex - m_ReadCount; + } + + void Clear() + { + m_WriteIndex = 0; + m_ReadIndex = 0; + m_ReadCount = 0; + memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); + } + + private: + const static uint32_t ms_cSize = ( 1 << cSizeLog2 ); + const static uint32_t ms_cIndexMask = ms_cSize - 1; + const static uint32_t FLAG_INVALID = 0xFFFFFFFF; // 32bit for CAS + const static uint32_t FLAG_CAN_WRITE = 0x00000000; // 32bit for CAS + const static uint32_t FLAG_CAN_READ = 0x11111111; // 32bit for CAS + + T m_Buffer[ ms_cSize ]; + + // read and write indexes allow fast access to the pipe, but actual access + // controlled by the access flags. + volatile uint32_t BASE_ALIGN(4) m_WriteIndex; + volatile uint32_t BASE_ALIGN(4) m_ReadCount; + volatile uint32_t m_Flags[ ms_cSize ]; + volatile uint32_t BASE_ALIGN(4) m_ReadIndex; + }; + + template inline + LockLessMultiReadPipe::LockLessMultiReadPipe() + : m_WriteIndex(0) + , m_ReadIndex(0) + , m_ReadCount(0) + { + assert( cSizeLog2 < 32 ); + memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); + } + + template inline + bool LockLessMultiReadPipe::ReaderTryReadBack( T* pOut ) + { + + uint32_t actualReadIndex; + + uint32_t readCount = m_ReadCount; + + // We get hold of read index for consistency, + // and do first pass starting at read count + uint32_t readIndexToUse = readCount; + + + while(true) + { + + uint32_t writeIndex = m_WriteIndex; + // power of two sizes ensures we can use a simple calc without modulus + uint32_t numInPipe = writeIndex - readCount; + if( 0 == numInPipe ) + { + return false; + } + if( readIndexToUse >= writeIndex ) + { + // move back to start + readIndexToUse = m_ReadIndex; + } + + + // power of two sizes ensures we can perform AND for a modulus + actualReadIndex = readIndexToUse & ms_cIndexMask; + + // Multiple potential readers mean we should check if the data is valid, + // using an atomic compare exchange + uint32_t previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ ); + if( FLAG_CAN_READ == previous ) + { + break; + } + ++readIndexToUse; + + //update known readcount + readCount = m_ReadCount; + } + + // we update the read index using an atomic add, as we've only read one piece of data. + // this ensure consistency of the read index, and the above loop ensures readers + // only read from unread data + AtomicAdd( (volatile int32_t*)&m_ReadCount, 1 ); + + BASE_MEMORYBARRIER_ACQUIRE(); + // now read data, ensuring we do so after above reads & CAS + *pOut = m_Buffer[ actualReadIndex ]; + + m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE; + + return true; + } + + template inline + bool LockLessMultiReadPipe::WriterTryReadFront( T* pOut ) + { + uint32_t writeIndex = m_WriteIndex; + uint32_t frontReadIndex = writeIndex; + + // Multiple potential readers mean we should check if the data is valid, + // using an atomic compare exchange - which acts as a form of lock (so not quite lockless really). + uint32_t previous = FLAG_INVALID; + uint32_t actualReadIndex = 0; + while( true ) + { + // power of two sizes ensures we can use a simple calc without modulus + uint32_t readCount = m_ReadCount; + uint32_t numInPipe = writeIndex - readCount; + if( 0 == numInPipe || 0 == frontReadIndex ) + { + // frontReadIndex can get to 0 here if that item was just being read by another thread. + m_ReadIndex = readCount; + return false; + } + --frontReadIndex; + actualReadIndex = frontReadIndex & ms_cIndexMask; + previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ ); + if( FLAG_CAN_READ == previous ) + { + break; + } + else if( m_ReadIndex >= frontReadIndex ) + { + return false; + } + } + + // now read data, ensuring we do so after above reads & CAS + *pOut = m_Buffer[ actualReadIndex ]; + + m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE; + + BASE_MEMORYBARRIER_RELEASE(); + + // 32-bit aligned stores are atomic, and writer owns the write index + // we only move one back as this is as many as we have read, not where we have read from. + --m_WriteIndex; + return true; + } + + + template inline + bool LockLessMultiReadPipe::WriterTryWriteFront( const T& in ) + { + // The writer 'owns' the write index, and readers can only reduce + // the amount of data in the pipe. + // We get hold of both values for consistency and to reduce false sharing + // impacting more than one access + uint32_t writeIndex = m_WriteIndex; + + + // power of two sizes ensures we can perform AND for a modulus + uint32_t actualWriteIndex = writeIndex & ms_cIndexMask; + + // a reader may still be reading this item, as there are multiple readers + if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE ) + { + return false; // still being read, so have caught up with tail. + } + + + // as we are the only writer we can update the data without atomics + // whilst the write index has not been updated + m_Buffer[ actualWriteIndex ] = in; + m_Flags[ actualWriteIndex ] = FLAG_CAN_READ; + + // We need to ensure the above writes occur prior to updating the write index, + // otherwise another thread might read before it's finished + BASE_MEMORYBARRIER_RELEASE(); + + // 32-bit aligned stores are atomic, and the writer controls the write index + ++writeIndex; + m_WriteIndex = writeIndex; + return true; + } + +} diff --git a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp new file mode 100644 index 00000000..187673ad --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp @@ -0,0 +1,437 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#include + +#include "TaskScheduler.h" +#include "LockLessMultiReadPipe.h" + + + +using namespace enki; + + +static const uint32_t PIPESIZE_LOG2 = 8; +static const uint32_t SPIN_COUNT = 100; +static const uint32_t SPIN_BACKOFF_MULTIPLIER = 10; +static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8; + +// each software thread gets it's own copy of gtl_threadNum, so this is safe to use as a static variable +static THREAD_LOCAL uint32_t gtl_threadNum = 0; + +namespace enki +{ + struct SubTaskSet + { + ITaskSet* pTask; + TaskSetPartition partition; + }; + + // we derive class TaskPipe rather than typedef to get forward declaration working easily + class TaskPipe : public LockLessMultiReadPipe {}; + + struct ThreadArgs + { + uint32_t threadNum; + TaskScheduler* pTaskScheduler; + }; +} + +namespace +{ + SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ ) + { + SubTaskSet splitTask = subTask_; + uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start; + + if( rangeToSplit_ > rangeLeft ) + { + rangeToSplit_ = rangeLeft; + } + splitTask.partition.end = subTask_.partition.start + rangeToSplit_; + subTask_.partition.start = splitTask.partition.end; + return splitTask; + } + + #if defined _WIN32 + #if defined _M_IX86 || defined _M_X64 + #pragma intrinsic(_mm_pause) + inline void Pause() { _mm_pause(); } + #endif + #elif defined __i386__ || defined __x86_64__ + inline void Pause() { __asm__ __volatile__("pause;"); } + #else + inline void Pause() { ;} // may have NOP or yield equiv + #endif +} + + +static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_) +{ + if( func_ ) + { + func_(threadnum_); + } +} + +ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks() +{ + return &m_ProfilerCallbacks; +} + +THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs ) +{ + ThreadArgs args = *(ThreadArgs*)pArgs; + uint32_t threadNum = args.threadNum; + TaskScheduler* pTS = args.pTaskScheduler; + gtl_threadNum = threadNum; + + SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum ); + + uint32_t spinCount = 0; + uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped. + while( pTS->m_bRunning ) + { + if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) ) + { + // no tasks, will spin then wait + ++spinCount; + if( spinCount > SPIN_COUNT ) + { + pTS->WaitForTasks( threadNum ); + spinCount = 0; + } + else + { + uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER; + while( spinBackoffCount ) + { + Pause(); + --spinBackoffCount; + } + } + } + else + { + spinCount = 0; + } + } + + AtomicAdd( &pTS->m_NumThreadsRunning, -1 ); + SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum ); + + return 0; +} + + +void TaskScheduler::StartThreads() +{ + if( m_bHaveThreads ) + { + return; + } + m_bRunning = true; + + SemaphoreCreate( m_NewTaskSemaphore ); + + // we create one less thread than m_NumThreads as the main thread counts as one + m_pThreadNumStore = new ThreadArgs[m_NumThreads]; + m_pThreadIDs = new threadid_t[m_NumThreads]; + m_pThreadNumStore[0].threadNum = 0; + m_pThreadNumStore[0].pTaskScheduler = this; + m_pThreadIDs[0] = 0; + m_NumThreadsWaiting = 0; + m_NumThreadsRunning = 1;// acount for main thread + for( uint32_t thread = 1; thread < m_NumThreads; ++thread ) + { + m_pThreadNumStore[thread].threadNum = thread; + m_pThreadNumStore[thread].pTaskScheduler = this; + ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] ); + ++m_NumThreadsRunning; + } + + // ensure we have sufficient tasks to equally fill either all threads including main + // or just the threads we've launched, this is outside the firstinit as we want to be able + // to runtime change it + if( 1 == m_NumThreads ) + { + m_NumPartitions = 1; + m_NumInitialPartitions = 1; + } + else + { + m_NumPartitions = m_NumThreads * (m_NumThreads - 1); + m_NumInitialPartitions = m_NumThreads - 1; + if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS ) + { + m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS; + } + } + + m_bHaveThreads = true; +} + +void TaskScheduler::StopThreads( bool bWait_ ) +{ + if( m_bHaveThreads ) + { + // wait for them threads quit before deleting data + m_bRunning = false; + while( bWait_ && m_NumThreadsRunning > 1 ) + { + // keep firing event to ensure all threads pick up state of m_bRunning + SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning ); + } + + for( uint32_t thread = 1; thread < m_NumThreads; ++thread ) + { + ThreadTerminate( m_pThreadIDs[thread] ); + } + + m_NumThreads = 0; + delete[] m_pThreadNumStore; + delete[] m_pThreadIDs; + m_pThreadNumStore = 0; + m_pThreadIDs = 0; + SemaphoreClose( m_NewTaskSemaphore ); + + m_bHaveThreads = false; + m_NumThreadsWaiting = 0; + m_NumThreadsRunning = 0; + } +} + +bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ ) +{ + // check for tasks + SubTaskSet subTask; + bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask ); + + uint32_t threadToCheck = hintPipeToCheck_io_; + uint32_t checkCount = 0; + while( !bHaveTask && checkCount < m_NumThreads ) + { + threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads; + if( threadToCheck != threadNum ) + { + bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask ); + } + ++checkCount; + } + + if( bHaveTask ) + { + // update hint, will preserve value unless actually got task from another thread. + hintPipeToCheck_io_ = threadToCheck; + + uint32_t partitionSize = subTask.partition.end - subTask.partition.start; + if( subTask.pTask->m_RangeToRun < partitionSize ) + { + SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun ); + SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 ); + taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum ); + AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 ); + } + else + { + + // the task has already been divided up by AddTaskSetToPipe, so just run it + subTask.pTask->ExecuteRange( subTask.partition, threadNum ); + AtomicAdd( &subTask.pTask->m_RunningCount, -1 ); + } + } + + return bHaveTask; + +} + +void TaskScheduler::WaitForTasks( uint32_t threadNum ) +{ + // We incrememt the number of threads waiting here in order + // to ensure that the check for tasks occurs after the increment + // to prevent a task being added after a check, then the thread waiting. + // This will occasionally result in threads being mistakenly awoken, + // but they will then go back to sleep. + AtomicAdd( &m_NumThreadsWaiting, 1 ); + + bool bHaveTasks = false; + for( uint32_t thread = 0; thread < m_NumThreads; ++thread ) + { + if( !m_pPipesPerThread[ thread ].IsPipeEmpty() ) + { + bHaveTasks = true; + break; + } + } + if( !bHaveTasks ) + { + SafeCallback( m_ProfilerCallbacks.waitStart, threadNum ); + SemaphoreWait( m_NewTaskSemaphore ); + SafeCallback( m_ProfilerCallbacks.waitStop, threadNum ); + } + + int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 ); + assert( prev != 0 ); +} + +void TaskScheduler::WakeThreads() +{ + SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting ); +} + +void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, + uint32_t rangeToSplit_, int32_t runningCountOffset_ ) +{ + int32_t numAdded = 0; + while( subTask_.partition.start != subTask_.partition.end ) + { + SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ ); + + // add the partition to the pipe + ++numAdded; + if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) ) + { + if( numAdded > 1 ) + { + WakeThreads(); + } + // alter range to run the appropriate fraction + if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ ) + { + taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun; + subTask_.partition.start = taskToAdd.partition.end; + } + taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ ); + --numAdded; + } + } + + // increment running count by number added + AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ ); + + WakeThreads(); +} + +void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet ) +{ + // set running count to -1 to guarantee it won't be found complete until all subtasks added + pTaskSet->m_RunningCount = -1; + + // divide task up and add to pipe + pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions; + if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; } + + uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions; + if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; } + + SubTaskSet subTask; + subTask.pTask = pTaskSet; + subTask.partition.start = 0; + subTask.partition.end = pTaskSet->m_SetSize; + SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 ); +} + +void TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet ) +{ + uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped. + if( pTaskSet ) + { + while( pTaskSet->m_RunningCount ) + { + TryRunTask( gtl_threadNum, hintPipeToCheck_io ); + // should add a spin then wait for task completion event. + } + } + else + { + TryRunTask( gtl_threadNum, hintPipeToCheck_io ); + } +} + +void TaskScheduler::WaitforAll() +{ + bool bHaveTasks = true; + uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped. + int32_t threadsRunning = m_NumThreadsRunning - 1; + while( bHaveTasks || m_NumThreadsWaiting < threadsRunning ) + { + TryRunTask( gtl_threadNum, hintPipeToCheck_io ); + bHaveTasks = false; + for( uint32_t thread = 0; thread < m_NumThreads; ++thread ) + { + if( !m_pPipesPerThread[ thread ].IsPipeEmpty() ) + { + bHaveTasks = true; + break; + } + } + } +} + +void TaskScheduler::WaitforAllAndShutdown() +{ + WaitforAll(); + StopThreads(true); + delete[] m_pPipesPerThread; + m_pPipesPerThread = 0; +} + +uint32_t TaskScheduler::GetNumTaskThreads() const +{ + return m_NumThreads; +} + +TaskScheduler::TaskScheduler() + : m_pPipesPerThread(NULL) + , m_NumThreads(0) + , m_pThreadNumStore(NULL) + , m_pThreadIDs(NULL) + , m_bRunning(false) + , m_NumThreadsRunning(0) + , m_NumThreadsWaiting(0) + , m_NumPartitions(0) + , m_bHaveThreads(false) +{ + memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks)); +} + +TaskScheduler::~TaskScheduler() +{ + StopThreads( true ); // Stops threads, waiting for them. + + delete[] m_pPipesPerThread; + m_pPipesPerThread = 0; +} + +void TaskScheduler::Initialize( uint32_t numThreads_ ) +{ + assert( numThreads_ ); + StopThreads( true ); // Stops threads, waiting for them. + delete[] m_pPipesPerThread; + + m_NumThreads = numThreads_; + + m_pPipesPerThread = new TaskPipe[ m_NumThreads ]; + + StartThreads(); +} + +void TaskScheduler::Initialize() +{ + Initialize( GetNumHardwareThreads() ); +} \ No newline at end of file diff --git a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h new file mode 100644 index 00000000..74f9376c --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.h @@ -0,0 +1,177 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#pragma once + +#include +#include "Threads.h" + +namespace enki +{ + + struct TaskSetPartition + { + uint32_t start; + uint32_t end; + }; + + class TaskScheduler; + class TaskPipe; + struct ThreadArgs; + struct SubTaskSet; + + // Subclass ITaskSet to create tasks. + // TaskSets can be re-used, but check + class ITaskSet + { + public: + ITaskSet() + : m_SetSize(1) + , m_MinRange(1) + , m_RunningCount(0) + , m_RangeToRun(1) + {} + + ITaskSet( uint32_t setSize_ ) + : m_SetSize( setSize_ ) + , m_MinRange(1) + , m_RunningCount(0) + , m_RangeToRun(1) + {} + + ITaskSet( uint32_t setSize_, uint32_t minRange_ ) + : m_SetSize( setSize_ ) + , m_MinRange( minRange_ ) + , m_RunningCount(0) + , m_RangeToRun(minRange_) + {} + + // Execute range should be overloaded to process tasks. It will be called with a + // range_ where range.start >= 0; range.start < range.end; and range.end < m_SetSize; + // The range values should be mapped so that linearly processing them in order is cache friendly + // i.e. neighbouring values should be close together. + // threadnum should not be used for changing processing of data, it's intended purpose + // is to allow per-thread data buckets for output. + virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum ) = 0; + + // Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1 + uint32_t m_SetSize; + + // Minimum size of of TaskSetPartition range when splitting a task set into partitions. + // This should be set to a value which results in computation effort of at least 10k + // clock cycles to minimize tast scheduler overhead. + // NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple + // of m_MinRange. + // Also known as grain size in literature. + uint32_t m_MinRange; + + bool GetIsComplete() + { + return 0 == m_RunningCount; + } + private: + friend class TaskScheduler; + volatile int32_t m_RunningCount; + uint32_t m_RangeToRun; + }; + + // TaskScheduler implements several callbacks intended for profilers + typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ ); + struct ProfilerCallbacks + { + ProfilerCallbackFunc threadStart; + ProfilerCallbackFunc threadStop; + ProfilerCallbackFunc waitStart; + ProfilerCallbackFunc waitStop; + }; + + class TaskScheduler + { + public: + TaskScheduler(); + ~TaskScheduler(); + + // Call either Initialize() or Initialize( numThreads_ ) before adding tasks. + + // Initialize() will create GetNumHardwareThreads()-1 threads, which is + // sufficient to fill the system when including the main thread. + // Initialize can be called multiple times - it will wait for completion + // before re-initializing. + void Initialize(); + + // Initialize( numThreads_ ) - numThreads_ (must be > 0) + // will create numThreads_-1 threads, as thread 0 is + // the thread on which the initialize was called. + void Initialize( uint32_t numThreads_ ); + + + // Adds the TaskSet to pipe and returns if the pipe is not full. + // If the pipe is full, pTaskSet is run. + // should only be called from main thread, or within a task + void AddTaskSetToPipe( ITaskSet* pTaskSet ); + + // Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete(); + // should only be called from thread which created the taskscheduler , or within a task + // if called with 0 it will try to run tasks, and return if none available. + void WaitforTaskSet( const ITaskSet* pTaskSet ); + + // Waits for all task sets to complete - not guaranteed to work unless we know we + // are in a situation where tasks aren't being continuosly added. + void WaitforAll(); + + // Waits for all task sets to complete and shutdown threads - not guaranteed to work unless we know we + // are in a situation where tasks aren't being continuosly added. + void WaitforAllAndShutdown(); + + // Returns the number of threads created for running tasks + 1 + // to account for the main thread. + uint32_t GetNumTaskThreads() const; + + // Returns the ProfilerCallbacks structure so that it can be modified to + // set the callbacks. + ProfilerCallbacks* GetProfilerCallbacks(); + + private: + static THREADFUNC_DECL TaskingThreadFunction( void* pArgs ); + void WaitForTasks( uint32_t threadNum ); + bool TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ ); + void StartThreads(); + void StopThreads( bool bWait_ ); + void SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, + uint32_t rangeToSplit_, int32_t runningCountOffset_ ); + void WakeThreads(); + + TaskPipe* m_pPipesPerThread; + + uint32_t m_NumThreads; + ThreadArgs* m_pThreadNumStore; + threadid_t* m_pThreadIDs; + volatile bool m_bRunning; + volatile int32_t m_NumThreadsRunning; + volatile int32_t m_NumThreadsWaiting; + uint32_t m_NumPartitions; + uint32_t m_NumInitialPartitions; + semaphoreid_t m_NewTaskSemaphore; + bool m_bHaveThreads; + ProfilerCallbacks m_ProfilerCallbacks; + + TaskScheduler( const TaskScheduler& nocopy ); + TaskScheduler& operator=( const TaskScheduler& nocopy ); + }; + +} \ No newline at end of file diff --git a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp new file mode 100644 index 00000000..a7d25b49 --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.cpp @@ -0,0 +1,122 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#include "TaskScheduler_c.h" +#include "TaskScheduler.h" + +#include + +using namespace enki; + +struct enkiTaskScheduler : TaskScheduler +{ +}; + +struct enkiTaskSet : ITaskSet +{ + enkiTaskSet( enkiTaskExecuteRange taskFun_ ) : taskFun(taskFun_), pArgs(NULL) {} + + virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum ) + { + taskFun( range.start, range.end, threadnum, pArgs ); + } + + enkiTaskExecuteRange taskFun; + void* pArgs; +}; + +enkiTaskScheduler* enkiNewTaskScheduler() +{ + enkiTaskScheduler* pETS = new enkiTaskScheduler(); + return pETS; +} + +void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ ) +{ + pETS_->Initialize(); +} + +void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ ) +{ + pETS_->Initialize( numThreads_ ); +} + +void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ ) +{ + delete pETS_; +} + +enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ ) +{ + return new enkiTaskSet( taskFunc_ ); +} + +void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ ) +{ + delete pTaskSet_; +} + +void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, void* pArgs_, uint32_t setSize_ ) +{ + assert( pTaskSet_ ); + assert( pTaskSet_->taskFun ); + + pTaskSet_->m_SetSize = setSize_; + pTaskSet_->pArgs = pArgs_; + pETS_->AddTaskSetToPipe( pTaskSet_ ); +} + +void enkiAddTaskSetToPipeMinRange(enkiTaskScheduler * pETS_, enkiTaskSet * pTaskSet_, void * pArgs_, uint32_t setSize_, uint32_t minRange_) +{ + assert( pTaskSet_ ); + assert( pTaskSet_->taskFun ); + + pTaskSet_->m_SetSize = setSize_; + pTaskSet_->m_MinRange = minRange_; + pTaskSet_->pArgs = pArgs_; + pETS_->AddTaskSetToPipe( pTaskSet_ ); +} + +int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ ) +{ + assert( pTaskSet_ ); + return ( pTaskSet_->GetIsComplete() ) ? 1 : 0; +} + +void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ ) +{ + pETS_->WaitforTaskSet( pTaskSet_ ); +} + +void enkiWaitForAll( enkiTaskScheduler* pETS_ ) +{ + pETS_->WaitforAll(); +} + + +uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ ) +{ + return pETS_->GetNumTaskThreads(); +} + +enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ ) +{ + assert( sizeof(enkiProfilerCallbacks) == sizeof(enki::ProfilerCallbacks) ); + return (enkiProfilerCallbacks*)pETS_->GetProfilerCallbacks(); +} + diff --git a/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h new file mode 100644 index 00000000..912771cf --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/TaskScheduler_c.h @@ -0,0 +1,104 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct enkiTaskScheduler enkiTaskScheduler; +typedef struct enkiTaskSet enkiTaskSet; + +typedef void (* enkiTaskExecuteRange)( uint32_t start_, uint32_t end, uint32_t threadnum_, void* pArgs_ ); + + +// Create a new task scheduler +enkiTaskScheduler* enkiNewTaskScheduler(); + +// Initialize task scheduler - will create GetNumHardwareThreads()-1 threads, which is +// sufficient to fill the system when including the main thread. +// Initialize can be called multiple times - it will wait for completion +// before re-initializing. +void enkiInitTaskScheduler( enkiTaskScheduler* pETS_ ); + +// Initialize a task scheduler with numThreads_ (must be > 0) +// will create numThreads_-1 threads, as thread 0 is +// the thread on which the initialize was called. +void enkiInitTaskSchedulerNumThreads( enkiTaskScheduler* pETS_, uint32_t numThreads_ ); + + +// Delete a task scheduler +void enkiDeleteTaskScheduler( enkiTaskScheduler* pETS_ ); + +// Create a task set. +enkiTaskSet* enkiCreateTaskSet( enkiTaskScheduler* pETS_, enkiTaskExecuteRange taskFunc_ ); + +// Delete a task set. +void enkiDeleteTaskSet( enkiTaskSet* pTaskSet_ ); + +// Schedule the task +void enkiAddTaskSetToPipe( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, + void* pArgs_, uint32_t setSize_ ); + +// Schedule the task with a minimum range. +// This should be set to a value which results in computation effort of at least 10k +// clock cycles to minimize tast scheduler overhead. +// NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple +// of m_MinRange. +// Also known as grain size in literature. +void enkiAddTaskSetToPipeMinRange( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_, + void* pArgs_, uint32_t setSize_, uint32_t minRange_ ); + + +// Check if TaskSet is complete. Doesn't wait. Returns 1 if complete, 0 if not. +int enkiIsTaskSetComplete( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ ); + + +// Wait for a given task. +// should only be called from thread which created the taskscheduler , or within a task +// if called with 0 it will try to run tasks, and return if none available. +void enkiWaitForTaskSet( enkiTaskScheduler* pETS_, enkiTaskSet* pTaskSet_ ); + + +// Waits for all task sets to complete - not guaranteed to work unless we know we +// are in a situation where tasks aren't being continuosly added. +void enkiWaitForAll( enkiTaskScheduler* pETS_ ); + + +// get number of threads +uint32_t enkiGetNumTaskThreads( enkiTaskScheduler* pETS_ ); + +// TaskScheduler implements several callbacks intended for profilers +typedef void (*enkiProfilerCallbackFunc)( uint32_t threadnum_ ); +struct enkiProfilerCallbacks +{ + enkiProfilerCallbackFunc threadStart; + enkiProfilerCallbackFunc threadStop; + enkiProfilerCallbackFunc waitStart; + enkiProfilerCallbackFunc waitStop; +}; + +// Get the callback structure so it can be set +struct enkiProfilerCallbacks* enkiGetProfilerCallbacks( enkiTaskScheduler* pETS_ ); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/examples/ToyPathTracer/Source/enkiTS/Threads.h b/examples/ToyPathTracer/Source/enkiTS/Threads.h new file mode 100644 index 00000000..575a2244 --- /dev/null +++ b/examples/ToyPathTracer/Source/enkiTS/Threads.h @@ -0,0 +1,210 @@ +// Copyright (c) 2013 Doug Binks +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. + +#pragma once + +#include +#include + +#ifdef _WIN32 + + #include "Atomics.h" + + #define WIN32_LEAN_AND_MEAN + #include + + #define THREADFUNC_DECL DWORD WINAPI + #define THREAD_LOCAL __declspec( thread ) + +namespace enki +{ + typedef HANDLE threadid_t; + + // declare the thread start function as: + // THREADFUNC_DECL MyThreadStart( void* pArg ); + inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg ) + { + // posix equiv pthread_create + DWORD threadid; + *returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid ); + return *returnid != NULL; + } + + inline bool ThreadTerminate( threadid_t threadid ) + { + // posix equiv pthread_cancel + return CloseHandle( threadid ) == 0; + } + + inline uint32_t GetNumHardwareThreads() + { + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + return sysInfo.dwNumberOfProcessors; + } +} + +#else // posix + + #include + #include + #define THREADFUNC_DECL void* + #define THREAD_LOCAL __thread + +namespace enki +{ + typedef pthread_t threadid_t; + + // declare the thread start function as: + // THREADFUNC_DECL MyThreadStart( void* pArg ); + inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg ) + { + // posix equiv pthread_create + int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg ); + + return retval == 0; + } + + inline bool ThreadTerminate( threadid_t threadid ) + { + // posix equiv pthread_cancel + return pthread_cancel( threadid ) == 0; + } + + inline uint32_t GetNumHardwareThreads() + { + return (uint32_t)sysconf( _SC_NPROCESSORS_ONLN ); + } +} + +#endif // posix + + +// Semaphore implementation +#ifdef _WIN32 + +namespace enki +{ + struct semaphoreid_t + { + HANDLE sem; + }; + + inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) + { + semaphoreid.sem = CreateSemaphore(NULL, 0, MAXLONG, NULL ); + } + + inline void SemaphoreClose( semaphoreid_t& semaphoreid ) + { + CloseHandle( semaphoreid.sem ); + } + + inline void SemaphoreWait( semaphoreid_t& semaphoreid ) + { + DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE ); + + assert( retval != WAIT_FAILED ); + } + + inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) + { + if( countWaiting ) + { + ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL ); + } + } +} +#elif defined(__MACH__) + +// OS X does not have POSIX semaphores +// see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html +#include + +namespace enki +{ + + struct semaphoreid_t + { + semaphore_t sem; + }; + + inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) + { + semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 ); + } + + inline void SemaphoreClose( semaphoreid_t& semaphoreid ) + { + semaphore_destroy( mach_task_self(), semaphoreid.sem ); + } + + inline void SemaphoreWait( semaphoreid_t& semaphoreid ) + { + semaphore_wait( semaphoreid.sem ); + } + + inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) + { + while( countWaiting-- > 0 ) + { + semaphore_signal( semaphoreid.sem ); + } + } +} + +#else // POSIX + +#include + +namespace enki +{ + + struct semaphoreid_t + { + sem_t sem; + }; + + inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) + { + int err = sem_init( &semaphoreid.sem, 0, 0 ); + assert( err == 0 ); + } + + inline void SemaphoreClose( semaphoreid_t& semaphoreid ) + { + sem_destroy( &semaphoreid.sem ); + } + + inline void SemaphoreWait( semaphoreid_t& semaphoreid ) + { + int err = sem_wait( &semaphoreid.sem ); + assert( err == 0 ); + } + + inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) + { + while( countWaiting-- > 0 ) + { + sem_post( &semaphoreid.sem ); + } + } +} +#endif + + diff --git a/examples/ToyPathTracer/Windows/ComputeShader.hlsl b/examples/ToyPathTracer/Windows/ComputeShader.hlsl new file mode 100644 index 00000000..ed6ef03f --- /dev/null +++ b/examples/ToyPathTracer/Windows/ComputeShader.hlsl @@ -0,0 +1,395 @@ +#include "../Source/Config.h" + +inline uint RNG(inout uint state) +{ + uint x = state; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 15; + state = x; + return x; +} + +float RandomFloat01(inout uint state) +{ + return (RNG(state) & 0xFFFFFF) / 16777216.0f; +} + +float3 RandomInUnitDisk(inout uint state) +{ + float a = RandomFloat01(state) * 2.0f * 3.1415926f; + float2 xy = float2(cos(a), sin(a)); + xy *= sqrt(RandomFloat01(state)); + return float3(xy, 0); +} +float3 RandomInUnitSphere(inout uint state) +{ + float z = RandomFloat01(state) * 2.0f - 1.0f; + float t = RandomFloat01(state) * 2.0f * 3.1415926f; + float r = sqrt(max(0.0, 1.0f - z * z)); + float x = r * cos(t); + float y = r * sin(t); + float3 res = float3(x, y, z); + res *= pow(RandomFloat01(state), 1.0 / 3.0); + return res; +} +float3 RandomUnitVector(inout uint state) +{ + float z = RandomFloat01(state) * 2.0f - 1.0f; + float a = RandomFloat01(state) * 2.0f * 3.1415926f; + float r = sqrt(1.0f - z * z); + float x = r * cos(a); + float y = r * sin(a); + return float3(x, y, z); +} + + + +struct Ray +{ + float3 orig; + float3 dir; +}; +Ray MakeRay(float3 orig_, float3 dir_) { Ray r; r.orig = orig_; r.dir = dir_; return r; } +float3 RayPointAt(Ray r, float t) { return r.orig + r.dir * t; } + + +inline bool refract(float3 v, float3 n, float nint, out float3 outRefracted) +{ + float dt = dot(v, n); + float discr = 1.0f - nint * nint*(1 - dt * dt); + if (discr > 0) + { + outRefracted = nint * (v - n * dt) - n * sqrt(discr); + return true; + } + return false; +} +inline float schlick(float cosine, float ri) +{ + float r0 = (1 - ri) / (1 + ri); + r0 = r0 * r0; + // note: saturate to guard against possible tiny negative numbers + return r0 + (1 - r0)*pow(saturate(1 - cosine), 5); +} + +struct Hit +{ + float3 pos; + float3 normal; + float t; +}; + +struct Sphere +{ + float3 center; + float radius; + float invRadius; +}; + +#define MatLambert 0 +#define MatMetal 1 +#define MatDielectric 2 + +struct Material +{ + int type; + float3 albedo; + float3 emissive; + float roughness; + float ri; +}; + +groupshared Sphere s_GroupSpheres[kCSMaxObjects]; +groupshared Material s_GroupMaterials[kCSMaxObjects]; +groupshared int s_GroupEmissives[kCSMaxObjects]; + + +struct Camera +{ + float3 origin; + float3 lowerLeftCorner; + float3 horizontal; + float3 vertical; + float3 u, v, w; + float lensRadius; +}; + +Ray CameraGetRay(Camera cam, float s, float t, inout uint state) +{ + float3 rd = cam.lensRadius * RandomInUnitDisk(state); + float3 offset = cam.u * rd.x + cam.v * rd.y; + return MakeRay(cam.origin + offset, normalize(cam.lowerLeftCorner + s * cam.horizontal + t * cam.vertical - cam.origin - offset)); +} + + +int HitSpheres(Ray r, int sphereCount, float tMin, float tMax, inout Hit outHit) +{ + float hitT = tMax; + int id = -1; + for (int i = 0; i < sphereCount; ++i) + { + Sphere s = s_GroupSpheres[i]; + float3 co = s.center - r.orig; + float nb = dot(co, r.dir); + float c = dot(co, co) - s.radius*s.radius; + float discr = nb * nb - c; + if (discr > 0) + { + float discrSq = sqrt(discr); + + // Try earlier t + float t = nb - discrSq; + if (t <= tMin) // before min, try later t! + t = nb + discrSq; + + if (t > tMin && t < hitT) + { + id = i; + hitT = t; + } + } + } + + if (id != -1) + { + outHit.pos = RayPointAt(r, hitT); + outHit.normal = (outHit.pos - s_GroupSpheres[id].center) * s_GroupSpheres[id].invRadius; + outHit.t = hitT; + } + return id; +} + +struct Params +{ + Camera cam; + int sphereCount; + int screenWidth; + int screenHeight; + int frames; + float invWidth; + float invHeight; + float lerpFac; + int emissiveCount; +}; + + +#define kMinT 0.001f +#define kMaxT 1.0e7f +#define kMaxDepth 10 + + +static int HitWorld(int sphereCount, Ray r, float tMin, float tMax, inout Hit outHit) +{ + return HitSpheres(r, sphereCount, tMin, tMax, outHit); +} + + +static bool Scatter(int sphereCount, int emissiveCount, int matID, Ray r_in, Hit rec, out float3 attenuation, out Ray scattered, out float3 outLightE, inout int inoutRayCount, inout uint state) +{ + outLightE = float3(0, 0, 0); + Material mat = s_GroupMaterials[matID]; + if (mat.type == MatLambert) + { + // random point on unit sphere that is tangent to the hit point + float3 target = rec.pos + rec.normal + RandomUnitVector(state); + scattered = MakeRay(rec.pos, normalize(target - rec.pos)); + attenuation = mat.albedo; + + // sample lights +#if DO_LIGHT_SAMPLING + for (int j = 0; j < emissiveCount; ++j) + { + int i = s_GroupEmissives[j]; + if (matID == i) + continue; // skip self + Material smat = s_GroupMaterials[i]; + Sphere s = s_GroupSpheres[i]; + + // create a random direction towards sphere + // coord system for sampling: sw, su, sv + float3 sw = normalize(s.center - rec.pos); + float3 su = normalize(cross(abs(sw.x)>0.01f ? float3(0, 1, 0) : float3(1, 0, 0), sw)); + float3 sv = cross(sw, su); + // sample sphere by solid angle + float cosAMax = sqrt(1.0f - s.radius*s.radius / dot(rec.pos - s.center, rec.pos - s.center)); + float eps1 = RandomFloat01(state), eps2 = RandomFloat01(state); + float cosA = 1.0f - eps1 + eps1 * cosAMax; + float sinA = sqrt(1.0f - cosA * cosA); + float phi = 2 * 3.1415926 * eps2; + float3 l = su * cos(phi) * sinA + sv * sin(phi) * sinA + sw * cosA; + + // shoot shadow ray + Hit lightHit; + ++inoutRayCount; + int hitID = HitWorld(sphereCount, MakeRay(rec.pos, l), kMinT, kMaxT, lightHit); + if (hitID == i) + { + float omega = 2 * 3.1415926 * (1 - cosAMax); + + float3 rdir = r_in.dir; + float3 nl = dot(rec.normal, rdir) < 0 ? rec.normal : -rec.normal; + outLightE += (mat.albedo * smat.emissive) * (max(0.0f, dot(l, nl)) * omega / 3.1415926); + } + } +#endif + return true; + } + else if (mat.type == MatMetal) + { + float3 refl = reflect(r_in.dir, rec.normal); + // reflected ray, and random inside of sphere based on roughness + float roughness = mat.roughness; +#if DO_MITSUBA_COMPARE + roughness = 0; // until we get better BRDF for metals +#endif + scattered = MakeRay(rec.pos, normalize(refl + roughness*RandomInUnitSphere(state))); + attenuation = mat.albedo; + return dot(scattered.dir, rec.normal) > 0; + } + else if (mat.type == MatDielectric) + { + float3 outwardN; + float3 rdir = r_in.dir; + float3 refl = reflect(rdir, rec.normal); + float nint; + attenuation = float3(1, 1, 1); + float3 refr; + float reflProb; + float cosine; + if (dot(rdir, rec.normal) > 0) + { + outwardN = -rec.normal; + nint = mat.ri; + cosine = mat.ri * dot(rdir, rec.normal); + } + else + { + outwardN = rec.normal; + nint = 1.0f / mat.ri; + cosine = -dot(rdir, rec.normal); + } + if (refract(rdir, outwardN, nint, refr)) + { + reflProb = schlick(cosine, mat.ri); + } + else + { + reflProb = 1; + } + if (RandomFloat01(state) < reflProb) + scattered = MakeRay(rec.pos, normalize(refl)); + else + scattered = MakeRay(rec.pos, normalize(refr)); + } + else + { + attenuation = float3(1, 0, 1); + scattered = MakeRay(float3(0,0,0), float3(0, 0, 1)); + return false; + } + return true; +} + +static float3 Trace(int sphereCount, int emissiveCount, Ray r, inout int inoutRayCount, inout uint state) +{ + float3 col = 0; + float3 curAtten = 1; + bool doMaterialE = true; + // GPUs don't support recursion, so do tracing iterations in a loop up to max depth + for (int depth = 0; depth < kMaxDepth; ++depth) + { + Hit rec; + ++inoutRayCount; + int id = HitWorld(sphereCount, r, kMinT, kMaxT, rec); + if (id >= 0) + { + Ray scattered; + float3 attenuation; + float3 lightE; + Material mat = s_GroupMaterials[id]; + float3 matE = mat.emissive; + if (Scatter(sphereCount, emissiveCount, id, r, rec, attenuation, scattered, lightE, inoutRayCount, state)) + { +#if DO_LIGHT_SAMPLING + if (!doMaterialE) matE = 0; + doMaterialE = (mat.type != MatLambert); +#endif + col += curAtten * (matE + lightE); + curAtten *= attenuation; + r = scattered; + } + else + { + col += curAtten * matE; + break; + } + } + else + { + // sky +#if DO_MITSUBA_COMPARE + col += curAtten * float3(0.15f, 0.21f, 0.3f); // easier compare with Mitsuba's constant environment light +#else + float3 unitDir = r.dir; + float t = 0.5f*(unitDir.y + 1.0f); + float3 skyCol = ((1.0f - t)*float3(1.0f, 1.0f, 1.0f) + t * float3(0.5f, 0.7f, 1.0f)) * 0.3f; + col += curAtten * skyCol; +#endif + break; + } + } + return col; +} + +Texture2D srcImage : register(t0); +RWTexture2D dstImage : register(u0); +StructuredBuffer g_Spheres : register(t1); +StructuredBuffer g_Materials : register(t2); +StructuredBuffer g_Params : register(t3); +StructuredBuffer g_Emissives : register(t4); +RWByteAddressBuffer g_OutRayCount : register(u1); + +[numthreads(kCSGroupSizeX, kCSGroupSizeY, 1)] +void main(uint3 gid : SV_DispatchThreadID, uint3 tid : SV_GroupThreadID) +{ + // First, move scene data (spheres, materials, emissive indices) into group shared + // memory. Do this in parallel; each thread in group copies its own chunk of data. + uint threadID = tid.y * kCSGroupSizeX + tid.x; + uint groupSize = kCSGroupSizeX * kCSGroupSizeY; + uint objCount = g_Params[0].sphereCount; + uint myObjCount = (objCount + groupSize - 1) / groupSize; + uint myObjStart = threadID * myObjCount; + for (uint io = myObjStart; io < myObjStart + myObjCount; ++io) + { + if (io < objCount) + { + s_GroupSpheres[io] = g_Spheres[io]; + s_GroupMaterials[io] = g_Materials[io]; + } + if (io < g_Params[0].emissiveCount) + { + s_GroupEmissives[io] = g_Emissives[io]; + } + } + GroupMemoryBarrierWithGroupSync(); + + int rayCount = 0; + float3 col = 0; + Params params = g_Params[0]; + uint rngState = (gid.x * 1973 + gid.y * 9277 + params.frames * 26699) | 1; + for (int s = 0; s < DO_SAMPLES_PER_PIXEL; s++) + { + float u = float(gid.x + RandomFloat01(rngState)) * params.invWidth; + float v = float(gid.y + RandomFloat01(rngState)) * params.invHeight; + Ray r = CameraGetRay(params.cam, u, v, rngState); + col += Trace(params.sphereCount, params.emissiveCount, r, rayCount, rngState); + } + col *= 1.0f / float(DO_SAMPLES_PER_PIXEL); + + float3 prev = srcImage.Load(int3(gid.xy,0)).rgb; + col = lerp(col, prev, params.lerpFac); + dstImage[gid.xy] = float4(col, 1); + + g_OutRayCount.InterlockedAdd(0, rayCount); +} diff --git a/examples/ToyPathTracer/Windows/PixelShader.hlsl b/examples/ToyPathTracer/Windows/PixelShader.hlsl new file mode 100644 index 00000000..41864ccc --- /dev/null +++ b/examples/ToyPathTracer/Windows/PixelShader.hlsl @@ -0,0 +1,15 @@ +float3 LinearToSRGB(float3 rgb) +{ + rgb = max(rgb, float3(0, 0, 0)); + return max(1.055 * pow(rgb, 0.416666667) - 0.055, 0.0); +} + +Texture2D tex : register(t0); +SamplerState smp : register(s0); + +float4 main(float2 uv : TEXCOORD0) : SV_Target +{ + float3 col = tex.Sample(smp, uv).rgb; + col = LinearToSRGB(col); + return float4(col, 1.0f); +} diff --git a/examples/ToyPathTracer/Windows/TestCpu.sln b/examples/ToyPathTracer/Windows/TestCpu.sln new file mode 100644 index 00000000..a05d0ce6 --- /dev/null +++ b/examples/ToyPathTracer/Windows/TestCpu.sln @@ -0,0 +1,31 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27130.2036 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TestCpu", "TestCpu.vcxproj", "{4F84B756-87F5-4B92-827B-DA087DAE1900}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.ActiveCfg = Debug|x64 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x64.Build.0 = Debug|x64 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.ActiveCfg = Debug|Win32 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Debug|x86.Build.0 = Debug|Win32 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.ActiveCfg = Release|x64 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x64.Build.0 = Release|x64 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.ActiveCfg = Release|Win32 + {4F84B756-87F5-4B92-827B-DA087DAE1900}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {067FB780-37B8-465E-AD7E-E7B238B9C04F} + EndGlobalSection +EndGlobal diff --git a/examples/ToyPathTracer/Windows/TestCpu.vcxproj b/examples/ToyPathTracer/Windows/TestCpu.vcxproj new file mode 100644 index 00000000..9f2aa26e --- /dev/null +++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj @@ -0,0 +1,242 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {4F84B756-87F5-4B92-827B-DA087DAE1900} + Win32Proj + TestCpu + 10.0 + + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + Application + true + v142 + Unicode + + + Application + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Level3 + Disabled + true + WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + VectorCall + Fast + + + Windows + true + d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies) + + + + + Level3 + Disabled + true + _DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + VectorCall + Fast + + + Windows + true + d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies) + + + + + Level3 + MaxSpeed + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + false + MultiThreaded + false + VectorCall + Fast + + + Windows + true + true + true + d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies) + + + + + Level3 + MaxSpeed + true + true + NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + false + MultiThreaded + false + VectorCall + Fast + + + Windows + true + true + true + d3d11.lib;kernel32.lib;user32.lib;gdi32.lib;%(AdditionalDependencies) + + + + + + + + + + + + + + + + + + + + + + + + + + + Compute + 5.0 + Compute + 5.0 + Compute + 5.0 + Compute + 5.0 + g_CSBytecode + CompiledComputeShader.h + g_CSBytecode + CompiledComputeShader.h + g_CSBytecode + CompiledComputeShader.h + g_CSBytecode + CompiledComputeShader.h + + + Pixel + Pixel + Pixel + Pixel + 5.0 + 5.0 + 5.0 + 5.0 + CompiledPixelShader.h + CompiledPixelShader.h + CompiledPixelShader.h + CompiledPixelShader.h + g_PSBytecode + g_PSBytecode + g_PSBytecode + g_PSBytecode + + + Vertex + Vertex + Vertex + Vertex + 5.0 + 5.0 + 5.0 + 5.0 + CompiledVertexShader.h + CompiledVertexShader.h + CompiledVertexShader.h + CompiledVertexShader.h + g_VSBytecode + g_VSBytecode + g_VSBytecode + g_VSBytecode + + + + + + \ No newline at end of file diff --git a/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters b/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters new file mode 100644 index 00000000..bcf284f1 --- /dev/null +++ b/examples/ToyPathTracer/Windows/TestCpu.vcxproj.filters @@ -0,0 +1,66 @@ + + + + + + Source + + + Source\enkiTS + + + Source\enkiTS + + + Source + + + + + {5f19f217-c1c7-4eeb-be61-8b986fee9375} + + + {38c448a8-1dcc-4116-9410-a9f8d068caff} + + + + + Source + + + Source + + + Source\enkiTS + + + Source\enkiTS + + + Source\enkiTS + + + Source\enkiTS + + + Source\enkiTS + + + Source + + + Source + + + Source + + + + + + + + + + + \ No newline at end of file diff --git a/examples/ToyPathTracer/Windows/TestWin.cpp b/examples/ToyPathTracer/Windows/TestWin.cpp new file mode 100644 index 00000000..ea5aa3e9 --- /dev/null +++ b/examples/ToyPathTracer/Windows/TestWin.cpp @@ -0,0 +1,540 @@ +#include +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include + +#include +#include +#include +#include + +#include "../Source/Config.h" +#include "../Source/Maths.h" +#include "../Source/Test.h" +#include "CompiledVertexShader.h" +#include "CompiledPixelShader.h" + +static HINSTANCE g_HInstance; +static HWND g_Wnd; + +ATOM MyRegisterClass(HINSTANCE hInstance); +BOOL InitInstance(HINSTANCE, int); +LRESULT CALLBACK WndProc(HWND, UINT, WPARAM, LPARAM); +INT_PTR CALLBACK About(HWND, UINT, WPARAM, LPARAM); + +static HRESULT InitD3DDevice(); +static void ShutdownD3DDevice(); +static void RenderFrame(); + +static float* g_Backbuffer; + +static D3D_FEATURE_LEVEL g_D3D11FeatureLevel = D3D_FEATURE_LEVEL_11_0; +static ID3D11Device* g_D3D11Device = nullptr; +static ID3D11DeviceContext* g_D3D11Ctx = nullptr; +static IDXGISwapChain* g_D3D11SwapChain = nullptr; +static ID3D11RenderTargetView* g_D3D11RenderTarget = nullptr; +static ID3D11VertexShader* g_VertexShader; +static ID3D11PixelShader* g_PixelShader; +static ID3D11Texture2D *g_BackbufferTexture, *g_BackbufferTexture2; +static ID3D11ShaderResourceView *g_BackbufferSRV, *g_BackbufferSRV2; +static ID3D11UnorderedAccessView *g_BackbufferUAV, *g_BackbufferUAV2; +static ID3D11SamplerState* g_SamplerLinear; +static ID3D11RasterizerState* g_RasterState; +static int g_BackbufferIndex; + + +#if DO_COMPUTE_GPU +#include "CompiledComputeShader.h" +struct ComputeParams +{ + Camera cam; + int sphereCount; + int screenWidth; + int screenHeight; + int frames; + float invWidth; + float invHeight; + float lerpFac; + int emissiveCount; +}; +static ID3D11ComputeShader* g_ComputeShader; +static ID3D11Buffer* g_DataSpheres; static ID3D11ShaderResourceView* g_SRVSpheres; +static ID3D11Buffer* g_DataMaterials; static ID3D11ShaderResourceView* g_SRVMaterials; +static ID3D11Buffer* g_DataParams; static ID3D11ShaderResourceView* g_SRVParams; +static ID3D11Buffer* g_DataEmissives; static ID3D11ShaderResourceView* g_SRVEmissives; +static ID3D11Buffer* g_DataCounter; static ID3D11UnorderedAccessView* g_UAVCounter; +static int g_SphereCount, g_ObjSize, g_MatSize; +static ID3D11Query *g_QueryBegin, *g_QueryEnd, *g_QueryDisjoint; +#endif // #if DO_COMPUTE_GPU + +int APIENTRY wWinMain(_In_ HINSTANCE hInstance, _In_opt_ HINSTANCE, _In_ LPWSTR, _In_ int nCmdShow) +{ + g_Backbuffer = new float[kBackbufferWidth * kBackbufferHeight * 4]; + memset(g_Backbuffer, 0, kBackbufferWidth * kBackbufferHeight * 4 * sizeof(g_Backbuffer[0])); + + InitializeTest(); + + MyRegisterClass(hInstance); + if (!InitInstance (hInstance, nCmdShow)) + { + return FALSE; + } + + if (FAILED(InitD3DDevice())) + { + ShutdownD3DDevice(); + return 0; + } + + g_D3D11Device->CreateVertexShader(g_VSBytecode, ARRAYSIZE(g_VSBytecode), NULL, &g_VertexShader); + g_D3D11Device->CreatePixelShader(g_PSBytecode, ARRAYSIZE(g_PSBytecode), NULL, &g_PixelShader); +#if DO_COMPUTE_GPU + g_D3D11Device->CreateComputeShader(g_CSBytecode, ARRAYSIZE(g_CSBytecode), NULL, &g_ComputeShader); +#endif + + D3D11_TEXTURE2D_DESC texDesc = {}; + texDesc.Width = kBackbufferWidth; + texDesc.Height = kBackbufferHeight; + texDesc.MipLevels = 1; + texDesc.ArraySize = 1; + texDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + texDesc.SampleDesc.Count = 1; + texDesc.SampleDesc.Quality = 0; +#if DO_COMPUTE_GPU + texDesc.Usage = D3D11_USAGE_DEFAULT; + texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS; + texDesc.CPUAccessFlags = 0; +#else + texDesc.Usage = D3D11_USAGE_DYNAMIC; + texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; +#endif + texDesc.MiscFlags = 0; + g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture); + g_D3D11Device->CreateTexture2D(&texDesc, NULL, &g_BackbufferTexture2); + + D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc = {}; + srvDesc.Format = texDesc.Format; + srvDesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srvDesc.Texture2D.MipLevels = 1; + srvDesc.Texture2D.MostDetailedMip = 0; + g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture, &srvDesc, &g_BackbufferSRV); + g_D3D11Device->CreateShaderResourceView(g_BackbufferTexture2, &srvDesc, &g_BackbufferSRV2); + + D3D11_SAMPLER_DESC smpDesc = {}; + smpDesc.Filter = D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT; + smpDesc.AddressU = smpDesc.AddressV = smpDesc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; + g_D3D11Device->CreateSamplerState(&smpDesc, &g_SamplerLinear); + + D3D11_RASTERIZER_DESC rasterDesc = {}; + rasterDesc.FillMode = D3D11_FILL_SOLID; + rasterDesc.CullMode = D3D11_CULL_NONE; + g_D3D11Device->CreateRasterizerState(&rasterDesc, &g_RasterState); + +#if DO_COMPUTE_GPU + D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc = {}; + + int camSize; + GetObjectCount(g_SphereCount, g_ObjSize, g_MatSize, camSize); + assert(g_ObjSize == 20); + assert(g_MatSize == 36); + assert(camSize == 88); + D3D11_BUFFER_DESC bdesc = {}; + bdesc.ByteWidth = g_SphereCount * g_ObjSize; + bdesc.Usage = D3D11_USAGE_DEFAULT; + bdesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + bdesc.CPUAccessFlags = 0; + bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + bdesc.StructureByteStride = g_ObjSize; + g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataSpheres); + srvDesc.Format = DXGI_FORMAT_UNKNOWN; + srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvDesc.Buffer.FirstElement = 0; + srvDesc.Buffer.NumElements = g_SphereCount; + g_D3D11Device->CreateShaderResourceView(g_DataSpheres, &srvDesc, &g_SRVSpheres); + + bdesc.ByteWidth = g_SphereCount * g_MatSize; + bdesc.StructureByteStride = g_MatSize; + g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataMaterials); + srvDesc.Buffer.NumElements = g_SphereCount; + g_D3D11Device->CreateShaderResourceView(g_DataMaterials, &srvDesc, &g_SRVMaterials); + + bdesc.ByteWidth = sizeof(ComputeParams); + bdesc.StructureByteStride = sizeof(ComputeParams); + g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataParams); + srvDesc.Buffer.NumElements = 1; + g_D3D11Device->CreateShaderResourceView(g_DataParams, &srvDesc, &g_SRVParams); + + bdesc.ByteWidth = g_SphereCount * 4; + bdesc.StructureByteStride = 4; + g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataEmissives); + srvDesc.Buffer.NumElements = g_SphereCount; + g_D3D11Device->CreateShaderResourceView(g_DataEmissives, &srvDesc, &g_SRVEmissives); + + bdesc.ByteWidth = 4; + bdesc.BindFlags |= D3D11_BIND_UNORDERED_ACCESS; + bdesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + bdesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + g_D3D11Device->CreateBuffer(&bdesc, NULL, &g_DataCounter); + uavDesc.Format = DXGI_FORMAT_R32_TYPELESS; + uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + uavDesc.Buffer.FirstElement = 0; + uavDesc.Buffer.NumElements = 1; + uavDesc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; + g_D3D11Device->CreateUnorderedAccessView(g_DataCounter, &uavDesc, &g_UAVCounter); + + uavDesc.Format = DXGI_FORMAT_R32G32B32A32_FLOAT; + uavDesc.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; + uavDesc.Texture2D.MipSlice = 0; + g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture, &uavDesc, &g_BackbufferUAV); + g_D3D11Device->CreateUnorderedAccessView(g_BackbufferTexture2, &uavDesc, &g_BackbufferUAV2); + + D3D11_QUERY_DESC qDesc = {}; + qDesc.Query = D3D11_QUERY_TIMESTAMP; + g_D3D11Device->CreateQuery(&qDesc, &g_QueryBegin); + g_D3D11Device->CreateQuery(&qDesc, &g_QueryEnd); + qDesc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT; + g_D3D11Device->CreateQuery(&qDesc, &g_QueryDisjoint); +#endif // #if DO_COMPUTE_GPU + + + // Main message loop + MSG msg = { 0 }; + while (msg.message != WM_QUIT) + { + if (PeekMessage(&msg, NULL, 0U, 0U, PM_REMOVE)) + { + TranslateMessage(&msg); + DispatchMessage(&msg); + } + else + { + RenderFrame(); + } + } + + ShutdownTest(); + ShutdownD3DDevice(); + + return (int) msg.wParam; +} + + +ATOM MyRegisterClass(HINSTANCE hInstance) +{ + WNDCLASSEXW wcex; + memset(&wcex, 0, sizeof(wcex)); + wcex.cbSize = sizeof(WNDCLASSEX); + wcex.style = CS_HREDRAW | CS_VREDRAW; + wcex.lpfnWndProc = WndProc; + wcex.cbClsExtra = 0; + wcex.cbWndExtra = 0; + wcex.hInstance = hInstance; + wcex.hCursor = LoadCursor(nullptr, IDC_ARROW); + wcex.hbrBackground = (HBRUSH)(COLOR_WINDOW+1); + wcex.lpszClassName = L"TestClass"; + return RegisterClassExW(&wcex); +} + +BOOL InitInstance(HINSTANCE hInstance, int nCmdShow) +{ + g_HInstance = hInstance; + RECT rc = { 0, 0, kBackbufferWidth, kBackbufferHeight }; + DWORD style = WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX; + AdjustWindowRect(&rc, style, FALSE); + HWND hWnd = CreateWindowW(L"TestClass", L"Test", style, CW_USEDEFAULT, CW_USEDEFAULT, rc.right-rc.left, rc.bottom-rc.top, nullptr, nullptr, hInstance, nullptr); + if (!hWnd) + return FALSE; + g_Wnd = hWnd; + ShowWindow(hWnd, nCmdShow); + UpdateWindow(hWnd); + return TRUE; +} + +static uint64_t s_Time; +static int s_Count; +static char s_Buffer[200]; +static unsigned s_Flags = kFlagProgressive; +static int s_FrameCount = 0; + + +static void RenderFrame() +{ + LARGE_INTEGER time1; + +#if DO_COMPUTE_GPU + QueryPerformanceCounter(&time1); + float t = float(clock()) / CLOCKS_PER_SEC; + UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags); + + g_BackbufferIndex = 1 - g_BackbufferIndex; + void* dataSpheres = alloca(g_SphereCount * g_ObjSize); + void* dataMaterials = alloca(g_SphereCount * g_MatSize); + void* dataEmissives = alloca(g_SphereCount * 4); + ComputeParams dataParams; + GetSceneDesc(dataSpheres, dataMaterials, &dataParams.cam, dataEmissives, &dataParams.emissiveCount); + + dataParams.sphereCount = g_SphereCount; + dataParams.screenWidth = kBackbufferWidth; + dataParams.screenHeight = kBackbufferHeight; + dataParams.frames = s_FrameCount; + dataParams.invWidth = 1.0f / kBackbufferWidth; + dataParams.invHeight = 1.0f / kBackbufferHeight; + float lerpFac = float(s_FrameCount) / float(s_FrameCount + 1); + if (s_Flags & kFlagAnimate) + lerpFac *= DO_ANIMATE_SMOOTHING; + if (!(s_Flags & kFlagProgressive)) + lerpFac = 0; + dataParams.lerpFac = lerpFac; + + g_D3D11Ctx->UpdateSubresource(g_DataSpheres, 0, NULL, dataSpheres, 0, 0); + g_D3D11Ctx->UpdateSubresource(g_DataMaterials, 0, NULL, dataMaterials, 0, 0); + g_D3D11Ctx->UpdateSubresource(g_DataParams, 0, NULL, &dataParams, 0, 0); + g_D3D11Ctx->UpdateSubresource(g_DataEmissives, 0, NULL, dataEmissives, 0, 0); + + ID3D11ShaderResourceView* srvs[] = { + g_BackbufferIndex == 0 ? g_BackbufferSRV2 : g_BackbufferSRV, + g_SRVSpheres, + g_SRVMaterials, + g_SRVParams, + g_SRVEmissives + }; + g_D3D11Ctx->CSSetShaderResources(0, ARRAYSIZE(srvs), srvs); + ID3D11UnorderedAccessView* uavs[] = { + g_BackbufferIndex == 0 ? g_BackbufferUAV : g_BackbufferUAV2, + g_UAVCounter + }; + g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL); + g_D3D11Ctx->CSSetShader(g_ComputeShader, NULL, 0); + g_D3D11Ctx->Begin(g_QueryDisjoint); + g_D3D11Ctx->End(g_QueryBegin); + g_D3D11Ctx->Dispatch(kBackbufferWidth/kCSGroupSizeX, kBackbufferHeight/kCSGroupSizeY, 1); + g_D3D11Ctx->End(g_QueryEnd); + uavs[0] = NULL; + g_D3D11Ctx->CSSetUnorderedAccessViews(0, ARRAYSIZE(uavs), uavs, NULL); + ++s_FrameCount; + +#else + QueryPerformanceCounter(&time1); + float t = float(clock()) / CLOCKS_PER_SEC; + static size_t s_RayCounter = 0; + int rayCount; + UpdateTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, s_Flags); + DrawTest(t, s_FrameCount, kBackbufferWidth, kBackbufferHeight, g_Backbuffer, rayCount, s_Flags); + s_FrameCount++; + s_RayCounter += rayCount; + LARGE_INTEGER time2; + QueryPerformanceCounter(&time2); + uint64_t dt = time2.QuadPart - time1.QuadPart; + ++s_Count; + s_Time += dt; + if (s_Count > 10) + { + LARGE_INTEGER frequency; + QueryPerformanceFrequency(&frequency); + + double s = double(s_Time) / double(frequency.QuadPart) / s_Count; + sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount); + SetWindowTextA(g_Wnd, s_Buffer); + OutputDebugStringA(s_Buffer); + s_Count = 0; + s_Time = 0; + s_RayCounter = 0; + } + + D3D11_MAPPED_SUBRESOURCE mapped; + g_D3D11Ctx->Map(g_BackbufferTexture, 0, D3D11_MAP_WRITE_DISCARD, 0, &mapped); + const uint8_t* src = (const uint8_t*)g_Backbuffer; + uint8_t* dst = (uint8_t*)mapped.pData; + for (int y = 0; y < kBackbufferHeight; ++y) + { + memcpy(dst, src, kBackbufferWidth * 16); + src += kBackbufferWidth * 16; + dst += mapped.RowPitch; + } + g_D3D11Ctx->Unmap(g_BackbufferTexture, 0); +#endif + + g_D3D11Ctx->VSSetShader(g_VertexShader, NULL, 0); + g_D3D11Ctx->PSSetShader(g_PixelShader, NULL, 0); + g_D3D11Ctx->PSSetShaderResources(0, 1, g_BackbufferIndex == 0 ? &g_BackbufferSRV : &g_BackbufferSRV2); + g_D3D11Ctx->PSSetSamplers(0, 1, &g_SamplerLinear); + g_D3D11Ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + g_D3D11Ctx->RSSetState(g_RasterState); + g_D3D11Ctx->Draw(3, 0); + g_D3D11SwapChain->Present(0, 0); + +#if DO_COMPUTE_GPU + g_D3D11Ctx->End(g_QueryDisjoint); + + // get GPU times + while (g_D3D11Ctx->GetData(g_QueryDisjoint, NULL, 0, 0) == S_FALSE) { Sleep(0); } + D3D10_QUERY_DATA_TIMESTAMP_DISJOINT tsDisjoint; + g_D3D11Ctx->GetData(g_QueryDisjoint, &tsDisjoint, sizeof(tsDisjoint), 0); + if (!tsDisjoint.Disjoint) + { + UINT64 tsBegin, tsEnd; + // Note: on some GPUs/drivers, even when the disjoint query above already said "yeah I have data", + // might still not return "I have data" for timestamp queries before it. + while (g_D3D11Ctx->GetData(g_QueryBegin, &tsBegin, sizeof(tsBegin), 0) == S_FALSE) { Sleep(0); } + while (g_D3D11Ctx->GetData(g_QueryEnd, &tsEnd, sizeof(tsEnd), 0) == S_FALSE) { Sleep(0); } + + float s = float(tsEnd - tsBegin) / float(tsDisjoint.Frequency); + + static uint64_t s_RayCounter; + D3D11_MAPPED_SUBRESOURCE mapped; + g_D3D11Ctx->Map(g_DataCounter, 0, D3D11_MAP_READ, 0, &mapped); + s_RayCounter += *(const int*)mapped.pData; + g_D3D11Ctx->Unmap(g_DataCounter, 0); + int zeroCount = 0; + g_D3D11Ctx->UpdateSubresource(g_DataCounter, 0, NULL, &zeroCount, 0, 0); + + static float s_Time; + ++s_Count; + s_Time += s; + if (s_Count > 150) + { + s = s_Time / s_Count; + sprintf_s(s_Buffer, sizeof(s_Buffer), "%.2fms (%.1f FPS) %.1fMrays/s %.2fMrays/frame frames %i\n", s * 1000.0f, 1.f / s, s_RayCounter / s_Count / s * 1.0e-6f, s_RayCounter / s_Count * 1.0e-6f, s_FrameCount); + SetWindowTextA(g_Wnd, s_Buffer); + s_Count = 0; + s_Time = 0; + s_RayCounter = 0; + } + + } +#endif // #if DO_COMPUTE_GPU +} + + +LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) +{ + switch (message) + { + case WM_PAINT: + { + PAINTSTRUCT ps; + HDC hdc = BeginPaint(hWnd, &ps); + EndPaint(hWnd, &ps); + } + break; + case WM_DESTROY: + PostQuitMessage(0); + break; + case WM_CHAR: + if (wParam == 'a') + s_Flags = s_Flags ^ kFlagAnimate; + if (wParam == 'p') + { + s_Flags = s_Flags ^ kFlagProgressive; + s_FrameCount = 0; + } + break; + default: + return DefWindowProc(hWnd, message, wParam, lParam); + } + return 0; +} + + +static HRESULT InitD3DDevice() +{ + HRESULT hr = S_OK; + + RECT rc; + GetClientRect(g_Wnd, &rc); + UINT width = rc.right - rc.left; + UINT height = rc.bottom - rc.top; + + UINT createDeviceFlags = 0; +#ifdef _DEBUG + createDeviceFlags |= D3D11_CREATE_DEVICE_DEBUG; +#endif + + D3D_FEATURE_LEVEL featureLevels[] = + { + D3D_FEATURE_LEVEL_11_0, + }; + UINT numFeatureLevels = ARRAYSIZE(featureLevels); + hr = D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, createDeviceFlags, featureLevels, numFeatureLevels, D3D11_SDK_VERSION, &g_D3D11Device, &g_D3D11FeatureLevel, &g_D3D11Ctx); + if (FAILED(hr)) + return hr; + + // Get DXGI factory + IDXGIFactory1* dxgiFactory = nullptr; + { + IDXGIDevice* dxgiDevice = nullptr; + hr = g_D3D11Device->QueryInterface(__uuidof(IDXGIDevice), reinterpret_cast(&dxgiDevice)); + if (SUCCEEDED(hr)) + { + IDXGIAdapter* adapter = nullptr; + hr = dxgiDevice->GetAdapter(&adapter); + if (SUCCEEDED(hr)) + { + hr = adapter->GetParent(__uuidof(IDXGIFactory1), reinterpret_cast(&dxgiFactory)); + adapter->Release(); + } + dxgiDevice->Release(); + } + } + if (FAILED(hr)) + return hr; + + // Create swap chain + DXGI_SWAP_CHAIN_DESC sd; + ZeroMemory(&sd, sizeof(sd)); + sd.BufferCount = 1; + sd.BufferDesc.Width = width; + sd.BufferDesc.Height = height; + sd.BufferDesc.Format = DXGI_FORMAT_R8G8B8A8_UNORM; + sd.BufferDesc.RefreshRate.Numerator = 60; + sd.BufferDesc.RefreshRate.Denominator = 1; + sd.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; + sd.OutputWindow = g_Wnd; + sd.SampleDesc.Count = 1; + sd.SampleDesc.Quality = 0; + sd.Windowed = TRUE; + hr = dxgiFactory->CreateSwapChain(g_D3D11Device, &sd, &g_D3D11SwapChain); + + // Prevent Alt-Enter + dxgiFactory->MakeWindowAssociation(g_Wnd, DXGI_MWA_NO_ALT_ENTER); + dxgiFactory->Release(); + + if (FAILED(hr)) + return hr; + + // RTV + ID3D11Texture2D* pBackBuffer = nullptr; + hr = g_D3D11SwapChain->GetBuffer(0, __uuidof(ID3D11Texture2D), reinterpret_cast(&pBackBuffer)); + if (FAILED(hr)) + return hr; + hr = g_D3D11Device->CreateRenderTargetView(pBackBuffer, nullptr, &g_D3D11RenderTarget); + pBackBuffer->Release(); + if (FAILED(hr)) + return hr; + + g_D3D11Ctx->OMSetRenderTargets(1, &g_D3D11RenderTarget, nullptr); + + // Viewport + D3D11_VIEWPORT vp; + vp.Width = (float)width; + vp.Height = (float)height; + vp.MinDepth = 0.0f; + vp.MaxDepth = 1.0f; + vp.TopLeftX = 0; + vp.TopLeftY = 0; + g_D3D11Ctx->RSSetViewports(1, &vp); + + return S_OK; +} + +static void ShutdownD3DDevice() +{ + if (g_D3D11Ctx) g_D3D11Ctx->ClearState(); + + if (g_D3D11RenderTarget) g_D3D11RenderTarget->Release(); + if (g_D3D11SwapChain) g_D3D11SwapChain->Release(); + if (g_D3D11Ctx) g_D3D11Ctx->Release(); + if (g_D3D11Device) g_D3D11Device->Release(); +} diff --git a/examples/ToyPathTracer/Windows/VertexShader.hlsl b/examples/ToyPathTracer/Windows/VertexShader.hlsl new file mode 100644 index 00000000..956f4d56 --- /dev/null +++ b/examples/ToyPathTracer/Windows/VertexShader.hlsl @@ -0,0 +1,13 @@ +struct vs2ps +{ + float2 uv : TEXCOORD0; + float4 pos : SV_Position; +}; + +vs2ps main(uint vid : SV_VertexID) +{ + vs2ps o; + o.uv = float2((vid << 1) & 2, vid & 2); + o.pos = float4(o.uv * float2(2, 2) + float2(-1, -1), 0, 1); + return o; +} diff --git a/examples/ToyPathTracer/license.md b/examples/ToyPathTracer/license.md new file mode 100644 index 00000000..cf1ab25d --- /dev/null +++ b/examples/ToyPathTracer/license.md @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to