tracy/examples/ToyPathTracer/Source/Maths.h

#pragma once

#include <math.h>
#include <assert.h>
#include <stdint.h>
#include "Config.h"
#include "MathSimd.h"

#define kPI 3.1415926f

// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
#if DO_FLOAT3_WITH_SIMD


#if !defined(__arm__) && !defined(__arm64__)

// ---- SSE implementation

// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))

struct float3
{
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
    VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
    VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }
    VM_INLINE explicit float3(__m128 v) { m = v; }

    VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
    VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
    VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }

    VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
    VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }

    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }

    void setX(float x)
    {
        m = _mm_move_ss(m, _mm_set_ss(x));
    }
    void setY(float y)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(y));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
        m = _mm_move_ss(t, m);
    }
    void setZ(float z)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(z));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
        m = _mm_move_ss(t, m);
    }

    __m128 m;
};

typedef float3 bool3;

VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }

VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }

VM_INLINE float hmin(float3 v)
{
    v = min(v, SHUFFLE3(v, 1, 0, 2));
    return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float hmax(float3 v)
{
    v = max(v, SHUFFLE3(v, 1, 0, 2));
    return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
}

VM_INLINE float3 cross(float3 a, float3 b)
{
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
}

// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }

VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }

#else // #if !defined(__arm__) && !defined(__arm64__)

// ---- NEON implementation

#include <arm_neon.h>

struct float3
{
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }
    VM_INLINE explicit float3(float32x4_t v) { m = v; }
    
    VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
    VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
    VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
    
    VM_INLINE float3 yzx() const
    {
        float32x2_t low = vget_low_f32(m);
        float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);
        return float3(yzx);
    }
    VM_INLINE float3 zxy() const
    {
        float32x4_t p = m;
        p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];
        return float3(p);
    }
    
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
    
    void setX(float x)
    {
        m = vsetq_lane_f32(x, m, 0);
    }
    void setY(float y)
    {
        m = vsetq_lane_f32(y, m, 1);
    }
    void setZ(float z)
    {
        m = vsetq_lane_f32(z, m, 2);
    }
    
    float32x4_t m;
};

typedef float3 bool3;

VM_INLINE float32x4_t rcp_2(float32x4_t v)
{
    float32x4_t e = vrecpeq_f32(v);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    return e;
}

VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }

VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }

VM_INLINE float hmin(float3 v)
{
    float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
    return vget_lane_f32(minOfMinOfHalfs, 0);
}
VM_INLINE float hmax(float3 v)
{
    float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
    return vget_lane_f32(maxOfMaxOfHalfs, 0);
}

VM_INLINE float3 cross(float3 a, float3 b)
{
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
}

// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v)
{
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
    uint32x4_t t1 = vtstq_u32(t0, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }

VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }


#endif // #else of #if !defined(__arm__) && !defined(__arm64__)

#else // #if DO_FLOAT3_WITH_SIMD

// ---- Simple scalar C implementation


struct float3
{
    float3() : x(0), y(0), z(0) {}
    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}

    float3 operator-() const { return float3(-x, -y, -z); }
    float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
    float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
    float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
    float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }

    VM_INLINE float getX() const { return x; }
    VM_INLINE float getY() const { return y; }
    VM_INLINE float getZ() const { return z; }
    VM_INLINE void setX(float x_) { x = x_; }
    VM_INLINE void setY(float y_) { y = y_; }
    VM_INLINE void setZ(float z_) { z = z_; }
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }

    float x, y, z;
};

VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
VM_INLINE float3 cross(const float3& a, const float3& b)
{
    return float3(
                  a.y*b.z - a.z*b.y,
                  -(a.x*b.z - a.z*b.x),
                  a.x*b.y - a.y*b.x
                  );
}
#endif // #else of #if DO_FLOAT3_WITH_SIMD

VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
VM_INLINE float sqLength(float3 v) { return dot(v, v); }
VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }


inline void AssertUnit(float3 v)
{
    assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
}

inline float3 reflect(float3 v, float3 n)
{
    return v - 2*dot(v,n)*n;
}

inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
{
    AssertUnit(v);
    float dt = dot(v, n);
    float discr = 1.0f - nint*nint*(1-dt*dt);
    if (discr > 0)
    {
        outRefracted = nint * (v - n*dt) - n*sqrtf(discr);
        return true;
    }
    return false;
}
inline float schlick(float cosine, float ri)
{
    float r0 = (1-ri) / (1+ri);
    r0 = r0*r0;
    return r0 + (1-r0)*powf(1-cosine, 5);
}

struct Ray
{
    Ray() {}
    Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }

    float3 pointAt(float t) const { return orig + dir * t; }

    float3 orig;
    float3 dir;
};


struct Hit
{
    float3 pos;
    float3 normal;
    float t;
};


struct Sphere
{
    Sphere() : radius(1.0f), invRadius(0.0f) {}
    Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}

    void UpdateDerivedData() { invRadius = 1.0f/radius; }

    float3 center;
    float radius;
    float invRadius;
};


// data for all spheres in a "structure of arrays" layout
struct SpheresSoA
{
    SpheresSoA(int c)
    {
        count = c;
        // we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
        // enough space
        simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
        centerX = new float[simdCount];
        centerY = new float[simdCount];
        centerZ = new float[simdCount];
        sqRadius = new float[simdCount];
        invRadius = new float[simdCount];
        // set all data to "impossible sphere" state
        for (int i = count; i < simdCount; ++i)
        {
            centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
            sqRadius[i] = 0.0f;
            invRadius[i] = 0.0f;
        }
    }
    ~SpheresSoA()
    {
        delete[] centerX;
        delete[] centerY;
        delete[] centerZ;
        delete[] sqRadius;
        delete[] invRadius;
    }
    float* centerX;
    float* centerY;
    float* centerZ;
    float* sqRadius;
    float* invRadius;
    int simdCount;
    int count;
};


int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);

float RandomFloat01(uint32_t& state);
float3 RandomInUnitDisk(uint32_t& state);
float3 RandomInUnitSphere(uint32_t& state);
float3 RandomUnitVector(uint32_t& state);

struct Camera
{
    Camera() {}
    // vfov is top to bottom in degrees
    Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
    {
        lensRadius = aperture / 2;
        float theta = vfov*kPI/180;
        float halfHeight = tanf(theta/2);
        float halfWidth = aspect * halfHeight;
        origin = lookFrom;
        w = normalize(lookFrom - lookAt);
        u = normalize(cross(vup, w));
        v = cross(w, u);
        lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
        horizontal = 2*halfWidth*focusDist*u;
        vertical = 2*halfHeight*focusDist*v;
    }

    Ray GetRay(float s, float t, uint32_t& state) const
    {
        float3 rd = lensRadius * RandomInUnitDisk(state);
        float3 offset = u * rd.getX() + v * rd.getY();
        return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
    }

    float3 origin;
    float3 lowerLeftCorner;
    float3 horizontal;
    float3 vertical;
    float3 u, v, w;
    float lensRadius;
};
Add aras-p's ToyPathTracer. https://github.com/aras-p/ToyPathTracer b076563906169aa2f9e6d7218ef85decf81f8f72 2019-10-29 21:21:34 +00:00			`#pragma once`

			`#include <math.h>`
			`#include <assert.h>`
			`#include <stdint.h>`
			`#include "Config.h"`
			`#include "MathSimd.h"`

			`#define kPI 3.1415926f`

			`// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/`
			`#if DO_FLOAT3_WITH_SIMD`


			`#if !defined(__arm__) && !defined(__arm64__)`

			`// ---- SSE implementation`

			`// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).`
			`// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).`
			`#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))`

			`struct float3`
			`{`
			`VM_INLINE float3() {}`
			`VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }`
			`VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }`
			`VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }`
			`VM_INLINE explicit float3(__m128 v) { m = v; }`

			`VM_INLINE float getX() const { return _mm_cvtss_f32(m); }`
			`VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }`
			`VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }`

			`VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }`
			`VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }`

			`VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }`

			`void setX(float x)`
			`{`
			`m = _mm_move_ss(m, _mm_set_ss(x));`
			`}`
			`void setY(float y)`
			`{`
			`__m128 t = _mm_move_ss(m, _mm_set_ss(y));`
			`t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));`
			`m = _mm_move_ss(t, m);`
			`}`
			`void setZ(float z)`
			`{`
			`__m128 t = _mm_move_ss(m, _mm_set_ss(z));`
			`t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));`
			`m = _mm_move_ss(t, m);`
			`}`

			`__m128 m;`
			`};`

			`typedef float3 bool3;`

			`VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }`
			`VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }`
			`VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }`
			`VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }`
			`VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }`
			`VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }`
			`VM_INLINE float3& operator= (float3 &a, float3 b) { a = a b; return a; }`
			`VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }`
			`VM_INLINE float3& operator= (float3 &a, float b) { a = a b; return a; }`
			`VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }`
			`VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }`
			`VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }`

			`VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }`

			`VM_INLINE float hmin(float3 v)`
			`{`
			`v = min(v, SHUFFLE3(v, 1, 0, 2));`
			`return min(v, SHUFFLE3(v, 2, 0, 1)).getX();`
			`}`
			`VM_INLINE float hmax(float3 v)`
			`{`
			`v = max(v, SHUFFLE3(v, 1, 0, 2));`
			`return max(v, SHUFFLE3(v, 2, 0, 1)).getX();`
			`}`

			`VM_INLINE float3 cross(float3 a, float3 b)`
			`{`
			`// x <- a.yb.z - a.zb.y`
			`// y <- a.zb.x - a.xb.z`
			`// z <- a.xb.y - a.yb.x`
			`// We can save a shuffle by grouping it in this wacky order:`
			`return (a.zxy()b - ab.zxy()).zxy();`
			`}`

			`// Returns a 3-bit code where bit0..bit2 is X..Z`
			`VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }`
			`// Once we have a comparison, we can branch based on its results:`
			`VM_INLINE bool any(bool3 v) { return mask(v) != 0; }`
			`VM_INLINE bool all(bool3 v) { return mask(v) == 7; }`

			`VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }`
			`VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }`
			`VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }`

			`#else // #if !defined(__arm__) && !defined(__arm64__)`

			`// ---- NEON implementation`

			`#include <arm_neon.h>`

			`struct float3`
			`{`
			`VM_INLINE float3() {}`
			`VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }`
			`VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }`
			`VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }`
			`VM_INLINE explicit float3(float32x4_t v) { m = v; }`

			`VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }`
			`VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }`
			`VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }`

			`VM_INLINE float3 yzx() const`
			`{`
			`float32x2_t low = vget_low_f32(m);`
			`float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);`
			`return float3(yzx);`
			`}`
			`VM_INLINE float3 zxy() const`
			`{`
			`float32x4_t p = m;`
			`p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];`
			`return float3(p);`
			`}`

			`VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }`

			`void setX(float x)`
			`{`
			`m = vsetq_lane_f32(x, m, 0);`
			`}`
			`void setY(float y)`
			`{`
			`m = vsetq_lane_f32(y, m, 1);`
			`}`
			`void setZ(float z)`
			`{`
			`m = vsetq_lane_f32(z, m, 2);`
			`}`

			`float32x4_t m;`
			`};`

			`typedef float3 bool3;`

			`VM_INLINE float32x4_t rcp_2(float32x4_t v)`
			`{`
			`float32x4_t e = vrecpeq_f32(v);`
			`e = vmulq_f32(vrecpsq_f32(e, v), e);`
			`e = vmulq_f32(vrecpsq_f32(e, v), e);`
			`return e;`
			`}`

			`VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }`
			`VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }`
			`VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }`
			`VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }`
			`VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }`
			`VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }`
			`VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }`
			`VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }`
			`VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }`
			`VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }`
			`VM_INLINE float3& operator= (float3 &a, float3 b) { a = a b; return a; }`
			`VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }`
			`VM_INLINE float3& operator= (float3 &a, float b) { a = a b; return a; }`
			`VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }`
			`VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }`
			`VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }`
			`VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }`
			`VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }`
			`VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }`

			`VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }`

			`VM_INLINE float hmin(float3 v)`
			`{`
			`float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));`
			`float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);`
			`return vget_lane_f32(minOfMinOfHalfs, 0);`
			`}`
			`VM_INLINE float hmax(float3 v)`
			`{`
			`float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));`
			`float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);`
			`return vget_lane_f32(maxOfMaxOfHalfs, 0);`
			`}`

			`VM_INLINE float3 cross(float3 a, float3 b)`
			`{`
			`// x <- a.yb.z - a.zb.y`
			`// y <- a.zb.x - a.xb.z`
			`// z <- a.xb.y - a.yb.x`
			`// We can save a shuffle by grouping it in this wacky order:`
			`return (a.zxy()b - ab.zxy()).zxy();`
			`}`

			`// Returns a 3-bit code where bit0..bit2 is X..Z`
			`VM_INLINE unsigned mask(float3 v)`
			`{`
			`static const uint32x4_t movemask = { 1, 2, 4, 8 };`
			`static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };`
			`uint32x4_t t0 = vreinterpretq_u32_f32(v.m);`
			`uint32x4_t t1 = vtstq_u32(t0, highbit);`
			`uint32x4_t t2 = vandq_u32(t1, movemask);`
			`uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));`
			`return vget_lane_u32(t3, 0) \| vget_lane_u32(t3, 1);`
			`}`
			`// Once we have a comparison, we can branch based on its results:`
			`VM_INLINE bool any(bool3 v) { return mask(v) != 0; }`
			`VM_INLINE bool all(bool3 v) { return mask(v) == 7; }`

			`VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }`
			`VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }`
			`VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }`


			`#endif // #else of #if !defined(__arm__) && !defined(__arm64__)`

			`#else // #if DO_FLOAT3_WITH_SIMD`

			`// ---- Simple scalar C implementation`


			`struct float3`
			`{`
			`float3() : x(0), y(0), z(0) {}`
			`float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}`

			`float3 operator-() const { return float3(-x, -y, -z); }`
			`float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }`
			`float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }`
			`float3& operator=(const float3& o) { x=o.x; y=o.y; z=o.z; return *this; }`
			`float3& operator=(float o) { x=o; y=o; z=o; return *this; }`

			`VM_INLINE float getX() const { return x; }`
			`VM_INLINE float getY() const { return y; }`
			`VM_INLINE float getZ() const { return z; }`
			`VM_INLINE void setX(float x_) { x = x_; }`
			`VM_INLINE void setY(float y_) { y = y_; }`
			`VM_INLINE void setZ(float z_) { z = z_; }`
			`VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }`

			`float x, y, z;`
			`};`

			`VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }`
			`VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }`
			`VM_INLINE float3 operator(const float3& a, const float3& b) { return float3(a.xb.x,a.yb.y,a.zb.z); }`
			`VM_INLINE float3 operator(const float3& a, float b) { return float3(a.xb,a.yb,a.zb); }`
			`VM_INLINE float3 operator(float a, const float3& b) { return float3(ab.x,ab.y,ab.z); }`
			`VM_INLINE float dot(const float3& a, const float3& b) { return a.xb.x+a.yb.y+a.z*b.z; }`
			`VM_INLINE float3 cross(const float3& a, const float3& b)`
			`{`
			`return float3(`
			`a.yb.z - a.zb.y,`
			`-(a.xb.z - a.zb.x),`
			`a.xb.y - a.yb.x`
			`);`
			`}`
			`#endif // #else of #if DO_FLOAT3_WITH_SIMD`

			`VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }`
			`VM_INLINE float sqLength(float3 v) { return dot(v, v); }`
			`VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }`
			`VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }`


			`inline void AssertUnit(float3 v)`
			`{`
			`assert(fabsf(sqLength(v) - 1.0f) < 0.01f);`
			`}`

			`inline float3 reflect(float3 v, float3 n)`
			`{`
			`return v - 2dot(v,n)n;`
			`}`

			`inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)`
			`{`
			`AssertUnit(v);`
			`float dt = dot(v, n);`
			`float discr = 1.0f - nintnint(1-dt*dt);`
			`if (discr > 0)`
			`{`
			`outRefracted = nint * (v - ndt) - nsqrtf(discr);`
			`return true;`
			`}`
			`return false;`
			`}`
			`inline float schlick(float cosine, float ri)`
			`{`
			`float r0 = (1-ri) / (1+ri);`
			`r0 = r0*r0;`
			`return r0 + (1-r0)*powf(1-cosine, 5);`
			`}`

			`struct Ray`
			`{`
			`Ray() {}`
			`Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }`

			`float3 pointAt(float t) const { return orig + dir * t; }`

			`float3 orig;`
			`float3 dir;`
			`};`


			`struct Hit`
			`{`
			`float3 pos;`
			`float3 normal;`
			`float t;`
			`};`


			`struct Sphere`
			`{`
			`Sphere() : radius(1.0f), invRadius(0.0f) {}`
			`Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}`

			`void UpdateDerivedData() { invRadius = 1.0f/radius; }`

			`float3 center;`
			`float radius;`
			`float invRadius;`
			`};`


			`// data for all spheres in a "structure of arrays" layout`
			`struct SpheresSoA`
			`{`
			`SpheresSoA(int c)`
			`{`
			`count = c;`
			`// we'll be processing spheres in kSimdWidth chunks, so make sure to allocate`
			`// enough space`
			`simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;`
			`centerX = new float[simdCount];`
			`centerY = new float[simdCount];`
			`centerZ = new float[simdCount];`
			`sqRadius = new float[simdCount];`
			`invRadius = new float[simdCount];`
			`// set all data to "impossible sphere" state`
			`for (int i = count; i < simdCount; ++i)`
			`{`
			`centerX[i] = centerY[i] = centerZ[i] = 10000.0f;`
			`sqRadius[i] = 0.0f;`
			`invRadius[i] = 0.0f;`
			`}`
			`}`
			`~SpheresSoA()`
			`{`
			`delete[] centerX;`
			`delete[] centerY;`
			`delete[] centerZ;`
			`delete[] sqRadius;`
			`delete[] invRadius;`
			`}`
			`float* centerX;`
			`float* centerY;`
			`float* centerZ;`
			`float* sqRadius;`
			`float* invRadius;`
			`int simdCount;`
			`int count;`
			`};`


			`int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);`

			`float RandomFloat01(uint32_t& state);`
			`float3 RandomInUnitDisk(uint32_t& state);`
			`float3 RandomInUnitSphere(uint32_t& state);`
			`float3 RandomUnitVector(uint32_t& state);`

			`struct Camera`
			`{`
			`Camera() {}`
			`// vfov is top to bottom in degrees`
			`Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)`
			`{`
			`lensRadius = aperture / 2;`
			`float theta = vfov*kPI/180;`
			`float halfHeight = tanf(theta/2);`
			`float halfWidth = aspect * halfHeight;`
			`origin = lookFrom;`
			`w = normalize(lookFrom - lookAt);`
			`u = normalize(cross(vup, w));`
			`v = cross(w, u);`
			`lowerLeftCorner = origin - halfWidthfocusDistu - halfHeightfocusDistv - focusDist*w;`
			`horizontal = 2halfWidthfocusDist*u;`
			`vertical = 2halfHeightfocusDist*v;`
			`}`

			`Ray GetRay(float s, float t, uint32_t& state) const`
			`{`
			`float3 rd = lensRadius * RandomInUnitDisk(state);`
			`float3 offset = u * rd.getX() + v * rd.getY();`
			`return Ray(origin + offset, normalize(lowerLeftCorner + shorizontal + tvertical - origin - offset));`
			`}`

			`float3 origin;`
			`float3 lowerLeftCorner;`
			`float3 horizontal;`
			`float3 vertical;`
			`float3 u, v, w;`
			`float lensRadius;`
			`};`