#pragma once

#include <math.h>
#include <assert.h>
#include <stdint.h>
#include "Config.h"
#include "MathSimd.h"

#define kPI 3.1415926f

// SSE/SIMD vector largely based on http://www.codersnotes.com/notes/maths-lib-2016/
#if DO_FLOAT3_WITH_SIMD


#if !defined(__arm__) && !defined(__arm64__)

// ---- SSE implementation

// SHUFFLE3(v, 0,1,2) leaves the vector unchanged (v.xyz).
// SHUFFLE3(v, 0,0,0) splats the X (v.xxx).
#define SHUFFLE3(V, X,Y,Z) float3(_mm_shuffle_ps((V).m, (V).m, _MM_SHUFFLE(Z,Z,Y,X)))

struct float3
{
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { m = _mm_set_ps(p[2], p[2], p[1], p[0]); }
    VM_INLINE explicit float3(float x, float y, float z) { m = _mm_set_ps(z, z, y, x); }
    VM_INLINE explicit float3(float v) { m = _mm_set1_ps(v); }
    VM_INLINE explicit float3(__m128 v) { m = v; }

    VM_INLINE float getX() const { return _mm_cvtss_f32(m); }
    VM_INLINE float getY() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1))); }
    VM_INLINE float getZ() const { return _mm_cvtss_f32(_mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2))); }

    VM_INLINE float3 yzx() const { return SHUFFLE3(*this, 1, 2, 0); }
    VM_INLINE float3 zxy() const { return SHUFFLE3(*this, 2, 0, 1); }

    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }

    void setX(float x)
    {
        m = _mm_move_ss(m, _mm_set_ss(x));
    }
    void setY(float y)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(y));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 2, 0, 0));
        m = _mm_move_ss(t, m);
    }
    void setZ(float z)
    {
        __m128 t = _mm_move_ss(m, _mm_set_ss(z));
        t = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 0, 1, 0));
        m = _mm_move_ss(t, m);
    }

    __m128 m;
};

typedef float3 bool3;

VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = _mm_add_ps(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = _mm_sub_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = _mm_mul_ps(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { a.m = _mm_div_ps(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = _mm_mul_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { a.m = _mm_div_ps(a.m, _mm_set1_ps(b)); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = _mm_mul_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { b.m = _mm_div_ps(_mm_set1_ps(a), b.m); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = _mm_cmpeq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = _mm_cmpneq_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = _mm_cmplt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = _mm_cmpgt_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = _mm_cmple_ps(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = _mm_cmpge_ps(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = _mm_min_ps(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = _mm_max_ps(a.m, b.m); return a; }

VM_INLINE float3 operator- (float3 a) { return float3(_mm_setzero_ps()) - a; }

VM_INLINE float hmin(float3 v)
{
    v = min(v, SHUFFLE3(v, 1, 0, 2));
    return min(v, SHUFFLE3(v, 2, 0, 1)).getX();
}
VM_INLINE float hmax(float3 v)
{
    v = max(v, SHUFFLE3(v, 1, 0, 2));
    return max(v, SHUFFLE3(v, 2, 0, 1)).getX();
}

VM_INLINE float3 cross(float3 a, float3 b)
{
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
}

// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v) { return _mm_movemask_ps(v.m) & 7; }
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }

VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }

#else // #if !defined(__arm__) && !defined(__arm64__)

// ---- NEON implementation

#include <arm_neon.h>

struct float3
{
    VM_INLINE float3() {}
    VM_INLINE explicit float3(const float *p) { float v[4] = {p[0], p[1], p[2], 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float x, float y, float z) { float v[4] = {x, y, z, 0}; m = vld1q_f32(v); }
    VM_INLINE explicit float3(float v) { m = vdupq_n_f32(v); }
    VM_INLINE explicit float3(float32x4_t v) { m = v; }
    
    VM_INLINE float getX() const { return vgetq_lane_f32(m, 0); }
    VM_INLINE float getY() const { return vgetq_lane_f32(m, 1); }
    VM_INLINE float getZ() const { return vgetq_lane_f32(m, 2); }
    
    VM_INLINE float3 yzx() const
    {
        float32x2_t low = vget_low_f32(m);
        float32x4_t yzx = vcombine_f32(vext_f32(low, vget_high_f32(m), 1), low);
        return float3(yzx);
    }
    VM_INLINE float3 zxy() const
    {
        float32x4_t p = m;
        p = vuzpq_f32(vreinterpretq_f32_s32(vextq_s32(vreinterpretq_s32_f32(p), vreinterpretq_s32_f32(p), 1)), p).val[1];
        return float3(p);
    }
    
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }
    
    void setX(float x)
    {
        m = vsetq_lane_f32(x, m, 0);
    }
    void setY(float y)
    {
        m = vsetq_lane_f32(y, m, 1);
    }
    void setZ(float z)
    {
        m = vsetq_lane_f32(z, m, 2);
    }
    
    float32x4_t m;
};

typedef float3 bool3;

VM_INLINE float32x4_t rcp_2(float32x4_t v)
{
    float32x4_t e = vrecpeq_f32(v);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    e = vmulq_f32(vrecpsq_f32(e, v), e);
    return e;
}

VM_INLINE float3 operator+ (float3 a, float3 b) { a.m = vaddq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator- (float3 a, float3 b) { a.m = vsubq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator* (float3 a, float3 b) { a.m = vmulq_f32(a.m, b.m); return a; }
VM_INLINE float3 operator/ (float3 a, float3 b) { float32x4_t recip = rcp_2(b.m); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float3 a, float b) { a.m = vmulq_f32(a.m, vdupq_n_f32(b)); return a; }
VM_INLINE float3 operator/ (float3 a, float b) { float32x4_t recip = rcp_2(vdupq_n_f32(b)); a.m = vmulq_f32(a.m, recip); return a; }
VM_INLINE float3 operator* (float a, float3 b) { b.m = vmulq_f32(vdupq_n_f32(a), b.m); return b; }
VM_INLINE float3 operator/ (float a, float3 b) { float32x4_t recip = rcp_2(b.m); b.m = vmulq_f32(vdupq_n_f32(a), recip); return b; }
VM_INLINE float3& operator+= (float3 &a, float3 b) { a = a + b; return a; }
VM_INLINE float3& operator-= (float3 &a, float3 b) { a = a - b; return a; }
VM_INLINE float3& operator*= (float3 &a, float3 b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float3 b) { a = a / b; return a; }
VM_INLINE float3& operator*= (float3 &a, float b) { a = a * b; return a; }
VM_INLINE float3& operator/= (float3 &a, float b) { a = a / b; return a; }
VM_INLINE bool3 operator==(float3 a, float3 b) { a.m = vceqq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator!=(float3 a, float3 b) { a.m = vmvnq_u32(vceqq_f32(a.m, b.m)); return a; }
VM_INLINE bool3 operator< (float3 a, float3 b) { a.m = vcltq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator> (float3 a, float3 b) { a.m = vcgtq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator<=(float3 a, float3 b) { a.m = vcleq_f32(a.m, b.m); return a; }
VM_INLINE bool3 operator>=(float3 a, float3 b) { a.m = vcgeq_f32(a.m, b.m); return a; }
VM_INLINE float3 min(float3 a, float3 b) { a.m = vminq_f32(a.m, b.m); return a; }
VM_INLINE float3 max(float3 a, float3 b) { a.m = vmaxq_f32(a.m, b.m); return a; }

VM_INLINE float3 operator- (float3 a) { a.m = vnegq_f32(a.m); return a; }

VM_INLINE float hmin(float3 v)
{
    float32x2_t minOfHalfs = vpmin_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t minOfMinOfHalfs = vpmin_f32(minOfHalfs, minOfHalfs);
    return vget_lane_f32(minOfMinOfHalfs, 0);
}
VM_INLINE float hmax(float3 v)
{
    float32x2_t maxOfHalfs = vpmax_f32(vget_low_f32(v.m), vget_high_f32(v.m));
    float32x2_t maxOfMaxOfHalfs = vpmax_f32(maxOfHalfs, maxOfHalfs);
    return vget_lane_f32(maxOfMaxOfHalfs, 0);
}

VM_INLINE float3 cross(float3 a, float3 b)
{
    // x  <-  a.y*b.z - a.z*b.y
    // y  <-  a.z*b.x - a.x*b.z
    // z  <-  a.x*b.y - a.y*b.x
    // We can save a shuffle by grouping it in this wacky order:
    return (a.zxy()*b - a*b.zxy()).zxy();
}

// Returns a 3-bit code where bit0..bit2 is X..Z
VM_INLINE unsigned mask(float3 v)
{
    static const uint32x4_t movemask = { 1, 2, 4, 8 };
    static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
    uint32x4_t t0 = vreinterpretq_u32_f32(v.m);
    uint32x4_t t1 = vtstq_u32(t0, highbit);
    uint32x4_t t2 = vandq_u32(t1, movemask);
    uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
    return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
}
// Once we have a comparison, we can branch based on its results:
VM_INLINE bool any(bool3 v) { return mask(v) != 0; }
VM_INLINE bool all(bool3 v) { return mask(v) == 7; }

VM_INLINE float3 clamp(float3 t, float3 a, float3 b) { return min(max(t, a), b); }
VM_INLINE float sum(float3 v) { return v.getX() + v.getY() + v.getZ(); }
VM_INLINE float dot(float3 a, float3 b) { return sum(a*b); }


#endif // #else of #if !defined(__arm__) && !defined(__arm64__)

#else // #if DO_FLOAT3_WITH_SIMD

// ---- Simple scalar C implementation


struct float3
{
    float3() : x(0), y(0), z(0) {}
    float3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}

    float3 operator-() const { return float3(-x, -y, -z); }
    float3& operator+=(const float3& o) { x+=o.x; y+=o.y; z+=o.z; return *this; }
    float3& operator-=(const float3& o) { x-=o.x; y-=o.y; z-=o.z; return *this; }
    float3& operator*=(const float3& o) { x*=o.x; y*=o.y; z*=o.z; return *this; }
    float3& operator*=(float o) { x*=o; y*=o; z*=o; return *this; }

    VM_INLINE float getX() const { return x; }
    VM_INLINE float getY() const { return y; }
    VM_INLINE float getZ() const { return z; }
    VM_INLINE void setX(float x_) { x = x_; }
    VM_INLINE void setY(float y_) { y = y_; }
    VM_INLINE void setZ(float z_) { z = z_; }
    VM_INLINE void store(float *p) const { p[0] = getX(); p[1] = getY(); p[2] = getZ(); }

    float x, y, z;
};

VM_INLINE float3 operator+(const float3& a, const float3& b) { return float3(a.x+b.x,a.y+b.y,a.z+b.z); }
VM_INLINE float3 operator-(const float3& a, const float3& b) { return float3(a.x-b.x,a.y-b.y,a.z-b.z); }
VM_INLINE float3 operator*(const float3& a, const float3& b) { return float3(a.x*b.x,a.y*b.y,a.z*b.z); }
VM_INLINE float3 operator*(const float3& a, float b) { return float3(a.x*b,a.y*b,a.z*b); }
VM_INLINE float3 operator*(float a, const float3& b) { return float3(a*b.x,a*b.y,a*b.z); }
VM_INLINE float dot(const float3& a, const float3& b) { return a.x*b.x+a.y*b.y+a.z*b.z; }
VM_INLINE float3 cross(const float3& a, const float3& b)
{
    return float3(
                  a.y*b.z - a.z*b.y,
                  -(a.x*b.z - a.z*b.x),
                  a.x*b.y - a.y*b.x
                  );
}
#endif // #else of #if DO_FLOAT3_WITH_SIMD

VM_INLINE float length(float3 v) { return sqrtf(dot(v, v)); }
VM_INLINE float sqLength(float3 v) { return dot(v, v); }
VM_INLINE float3 normalize(float3 v) { return v * (1.0f / length(v)); }
VM_INLINE float3 lerp(float3 a, float3 b, float t) { return a + (b-a)*t; }


inline void AssertUnit(float3 v)
{
    assert(fabsf(sqLength(v) - 1.0f) < 0.01f);
}

inline float3 reflect(float3 v, float3 n)
{
    return v - 2*dot(v,n)*n;
}

inline bool refract(float3 v, float3 n, float nint, float3& outRefracted)
{
    AssertUnit(v);
    float dt = dot(v, n);
    float discr = 1.0f - nint*nint*(1-dt*dt);
    if (discr > 0)
    {
        outRefracted = nint * (v - n*dt) - n*sqrtf(discr);
        return true;
    }
    return false;
}
inline float schlick(float cosine, float ri)
{
    float r0 = (1-ri) / (1+ri);
    r0 = r0*r0;
    return r0 + (1-r0)*powf(1-cosine, 5);
}

struct Ray
{
    Ray() {}
    Ray(float3 orig_, float3 dir_) : orig(orig_), dir(dir_) { AssertUnit(dir); }

    float3 pointAt(float t) const { return orig + dir * t; }

    float3 orig;
    float3 dir;
};


struct Hit
{
    float3 pos;
    float3 normal;
    float t;
};


struct Sphere
{
    Sphere() : radius(1.0f), invRadius(0.0f) {}
    Sphere(float3 center_, float radius_) : center(center_), radius(radius_), invRadius(0.0f) {}

    void UpdateDerivedData() { invRadius = 1.0f/radius; }

    float3 center;
    float radius;
    float invRadius;
};


// data for all spheres in a "structure of arrays" layout
struct SpheresSoA
{
    SpheresSoA(int c)
    {
        count = c;
        // we'll be processing spheres in kSimdWidth chunks, so make sure to allocate
        // enough space
        simdCount = (c + (kSimdWidth - 1)) / kSimdWidth * kSimdWidth;
        centerX = new float[simdCount];
        centerY = new float[simdCount];
        centerZ = new float[simdCount];
        sqRadius = new float[simdCount];
        invRadius = new float[simdCount];
        // set all data to "impossible sphere" state
        for (int i = count; i < simdCount; ++i)
        {
            centerX[i] = centerY[i] = centerZ[i] = 10000.0f;
            sqRadius[i] = 0.0f;
            invRadius[i] = 0.0f;
        }
    }
    ~SpheresSoA()
    {
        delete[] centerX;
        delete[] centerY;
        delete[] centerZ;
        delete[] sqRadius;
        delete[] invRadius;
    }
    float* centerX;
    float* centerY;
    float* centerZ;
    float* sqRadius;
    float* invRadius;
    int simdCount;
    int count;
};


int HitSpheres(const Ray& r, const SpheresSoA& spheres, float tMin, float tMax, Hit& outHit);

float RandomFloat01(uint32_t& state);
float3 RandomInUnitDisk(uint32_t& state);
float3 RandomInUnitSphere(uint32_t& state);
float3 RandomUnitVector(uint32_t& state);

struct Camera
{
    Camera() {}
    // vfov is top to bottom in degrees
    Camera(const float3& lookFrom, const float3& lookAt, const float3& vup, float vfov, float aspect, float aperture, float focusDist)
    {
        lensRadius = aperture / 2;
        float theta = vfov*kPI/180;
        float halfHeight = tanf(theta/2);
        float halfWidth = aspect * halfHeight;
        origin = lookFrom;
        w = normalize(lookFrom - lookAt);
        u = normalize(cross(vup, w));
        v = cross(w, u);
        lowerLeftCorner = origin - halfWidth*focusDist*u - halfHeight*focusDist*v - focusDist*w;
        horizontal = 2*halfWidth*focusDist*u;
        vertical = 2*halfHeight*focusDist*v;
    }

    Ray GetRay(float s, float t, uint32_t& state) const
    {
        float3 rd = lensRadius * RandomInUnitDisk(state);
        float3 offset = u * rd.getX() + v * rd.getY();
        return Ray(origin + offset, normalize(lowerLeftCorner + s*horizontal + t*vertical - origin - offset));
    }

    float3 origin;
    float3 lowerLeftCorner;
    float3 horizontal;
    float3 vertical;
    float3 u, v, w;
    float lensRadius;
};