// fp8.hpp — 8-bit floating point, E4M3 layout: 1 sign, 4 exponent, 3 mantissa
// bits, exponent bias 7 (IEEE-style: exponent field 0xF reserved for Inf/NaN).
// Normal range: ±2^-6 .. ±240; subnormals: ±mant * 2^-9.
#pragma once

#include <cstdint>
#include <cstring>
#include <cstddef>

#ifdef __CUDACC__
#include <cuda_fp16.h>
#define FP8_HOST_DEVICE __host__ __device__
#else
// Plain host builds must not see the CUDA qualifiers.
#define FP8_HOST_DEVICE
#endif

class fp8_e4m3 {
private:
    uint8_t data;  // raw bit pattern: S EEEE MMM

public:
    // ---- Constructors ------------------------------------------------------

    /// Zero-initialised (+0.0).
    FP8_HOST_DEVICE fp8_e4m3() : data(0) {}

    /// Construct from a raw bit pattern (no numeric conversion).
    FP8_HOST_DEVICE explicit fp8_e4m3(uint8_t val) : data(val) {}

    /// Convert from float32: round-to-nearest-even, overflow saturates to ±Inf,
    /// values below the smallest subnormal underflow to ±0.
    FP8_HOST_DEVICE explicit fp8_e4m3(float f) : data(cpu_float_to_fp8(f)) {}

#ifdef __CUDACC__
    /// Convert from float16 (CUDA builds only) via float32.
    /// NOTE(review): on sm_89+ the cuda_fp8.h intrinsics
    /// (__nv_cvt_float_to_fp8) could be used instead — confirm before swapping.
    FP8_HOST_DEVICE explicit fp8_e4m3(__half h)
        : data(cpu_float_to_fp8(__half2float(h))) {}
#endif

    // ---- Conversion --------------------------------------------------------

    /// Widen to float32 (exact: every e4m3 value is representable in float).
    FP8_HOST_DEVICE operator float() const { return cpu_fp8_to_float(data); }

    // ---- Arithmetic (computed in float32, result re-quantised) -------------

    FP8_HOST_DEVICE fp8_e4m3 operator+(const fp8_e4m3& other) const {
        return fp8_e4m3(float(*this) + float(other));
    }

    FP8_HOST_DEVICE fp8_e4m3 operator-(const fp8_e4m3& other) const {
        return fp8_e4m3(float(*this) - float(other));
    }

    FP8_HOST_DEVICE fp8_e4m3 operator*(const fp8_e4m3& other) const {
        return fp8_e4m3(float(*this) * float(other));
    }

    FP8_HOST_DEVICE fp8_e4m3 operator/(const fp8_e4m3& other) const {
        return fp8_e4m3(float(*this) / float(other));
    }

    FP8_HOST_DEVICE fp8_e4m3& operator+=(const fp8_e4m3& other) {
        *this = *this + other;
        return *this;
    }

    FP8_HOST_DEVICE fp8_e4m3& operator-=(const fp8_e4m3& other) {
        *this = *this - other;
        return *this;
    }

    FP8_HOST_DEVICE fp8_e4m3& operator*=(const fp8_e4m3& other) {
        *this = *this * other;
        return *this;
    }

    FP8_HOST_DEVICE fp8_e4m3& operator/=(const fp8_e4m3& other) {
        *this = *this / other;
        return *this;
    }

    // ---- Comparison --------------------------------------------------------

    /// IEEE-style equality: NaN (either side) compares unequal to everything,
    /// and +0.0 == -0.0.
    FP8_HOST_DEVICE bool operator==(const fp8_e4m3& other) const {
        // NaN patterns are 0x79..0x7F ignoring sign (exp field 0xF, mant != 0).
        if ((data & 0x7F) > 0x78 || (other.data & 0x7F) > 0x78) return false;
        if (((data | other.data) & 0x7F) == 0) return true;  // ±0 are equal
        return data == other.data;
    }

    FP8_HOST_DEVICE bool operator!=(const fp8_e4m3& other) const {
        return !(*this == other);
    }

    FP8_HOST_DEVICE bool operator<(const fp8_e4m3& other) const {
        return float(*this) < float(other);
    }

    FP8_HOST_DEVICE bool operator>(const fp8_e4m3& other) const {
        return float(*this) > float(other);
    }

    FP8_HOST_DEVICE bool operator<=(const fp8_e4m3& other) const {
        return float(*this) <= float(other);
    }

    FP8_HOST_DEVICE bool operator>=(const fp8_e4m3& other) const {
        return float(*this) >= float(other);
    }

    /// Raw 8-bit pattern.
    FP8_HOST_DEVICE uint8_t get_raw() const { return data; }

    // ---- Special values (bit patterns for the bias-7 codec below) ----------

    FP8_HOST_DEVICE static fp8_e4m3 zero()    { return fp8_e4m3(uint8_t(0x00)); }
    FP8_HOST_DEVICE static fp8_e4m3 one()     { return fp8_e4m3(uint8_t(0x38)); } // 1.0: exp field 7 (bias 7), mant 0
    FP8_HOST_DEVICE static fp8_e4m3 nan()     { return fp8_e4m3(uint8_t(0x7F)); }
    FP8_HOST_DEVICE static fp8_e4m3 inf()     { return fp8_e4m3(uint8_t(0x78)); } // +inf
    FP8_HOST_DEVICE static fp8_e4m3 neg_inf() { return fp8_e4m3(uint8_t(0xF8)); } // -inf

    // ---- Raw memory helpers (fp8 values are plain bytes) -------------------

    FP8_HOST_DEVICE static void memcpy(void* dst, const void* src, size_t count) {
        ::memcpy(dst, src, count);
    }

    FP8_HOST_DEVICE static void memset(void* ptr, int value, size_t count) {
        ::memset(ptr, value, count);
    }

private:
    /// float32 -> e4m3 with round-to-nearest-even.
    /// Overflow -> ±Inf (0x78/0xF8); underflow below 2^-9 -> ±0; subnormals
    /// in [2^-9, 2^-6) are encoded, not flushed.
    FP8_HOST_DEVICE static uint8_t cpu_float_to_fp8(float f) {
        uint32_t bits;
        ::memcpy(&bits, &f, sizeof(bits));

        const uint32_t sign   = bits >> 31;
        const uint32_t biased = (bits >> 23) & 0xFF;
        const uint32_t mant   = bits & 0x7FFFFF;

        if (biased == 0xFF) {                       // float Inf or NaN
            return (uint8_t)((sign << 7) | (mant ? 0x7F : 0x78));
        }
        if (biased == 0) {                          // float ±0 / subnormal:
            return (uint8_t)(sign << 7);            // far below e4m3 range
        }

        int32_t exp = (int32_t)biased - 127;

        if (exp >= -6) {                            // candidate normal
            if (exp > 7) return (uint8_t)((sign << 7) | 0x78);  // overflow
            uint32_t m   = mant >> 20;              // keep top 3 mantissa bits
            uint32_t rem = mant & 0xFFFFF;          // dropped 20 bits
            // Round to nearest, ties to even.
            if (rem > 0x80000 || (rem == 0x80000 && (m & 1))) {
                if (++m == 8) {                     // mantissa carry-out
                    m = 0;
                    if (++exp > 7) return (uint8_t)((sign << 7) | 0x78);
                }
            }
            return (uint8_t)((sign << 7) | ((uint32_t)(exp + 7) << 3) | m);
        }

        // Subnormal: stored value is m * 2^-9 with m in 1..7.
        const uint32_t sig = 0x800000u | mant;      // 24-bit significand
        const int32_t shift = 14 - exp;             // sig >> shift == |f| * 2^9
        if (shift >= 25) return (uint8_t)(sign << 7);  // < 2^-10: rounds to 0

        uint32_t m         = sig >> shift;
        const uint32_t rem = sig & ((1u << shift) - 1);
        const uint32_t half = 1u << (shift - 1);
        if (rem > half || (rem == half && (m & 1))) ++m;
        if (m >= 8) {                               // rounded up to min normal 2^-6
            return (uint8_t)((sign << 7) | (1u << 3));
        }
        return (uint8_t)((sign << 7) | m);
    }

    /// e4m3 -> float32 (always exact).
    FP8_HOST_DEVICE static float cpu_fp8_to_float(uint8_t fp8) {
        const uint32_t sign = (fp8 >> 7) & 0x1;
        const uint32_t exp  = (fp8 >> 3) & 0xF;
        const uint32_t mant = fp8 & 0x7;

        uint32_t bits;
        if (exp == 0xF) {                           // Inf (mant 0) or NaN
            bits = (sign << 31) | (0xFFu << 23) | (mant << 20);
        } else if (exp == 0) {
            if (mant == 0) {
                bits = sign << 31;                  // ±0
            } else {
                // Subnormal: mant * 2^-9, exact in float32.
                const float out = (float)mant * (1.0f / 512.0f);
                return sign ? -out : out;
            }
        } else {
            // Normal: rebias 7 -> 127 (exp - 7 + 127 == exp + 120).
            bits = (sign << 31) | ((exp + 120u) << 23) | (mant << 20);
        }

        float result;
        ::memcpy(&result, &bits, sizeof(result));
        return result;
    }
};

// Batch conversion helpers. The OpenMP pragmas are no-ops when the file is
// compiled without OpenMP support.
namespace fp8_ops {
    /// Convert an array of float32 to raw e4m3 bytes.
    inline void convert_float_to_fp8(uint8_t* dst, const float* src, size_t count) {
        #pragma omp parallel for simd if(count > 1024)
        for (size_t i = 0; i < count; ++i) {
            dst[i] = fp8_e4m3(src[i]).get_raw();
        }
    }

    /// Convert an array of raw e4m3 bytes to float32.
    inline void convert_fp8_to_float(float* dst, const uint8_t* src, size_t count) {
        #pragma omp parallel for simd if(count > 1024)
        for (size_t i = 0; i < count; ++i) {
            dst[i] = fp8_e4m3(src[i]);
        }
    }

    /// Fill `count` e4m3 slots with the same value (a byte fill suffices).
    inline void memset_fp8(void* ptr, fp8_e4m3 value, size_t count) {
        ::memset(ptr, value.get_raw(), count);
    }
}
mode 100644 index 0000000..6223ecb --- /dev/null +++ b/util/grid/grid32.hpp @@ -0,0 +1,118 @@ +#ifndef GRID_HPP +#define GRID_HPP + +#include "../vectorlogic/vec3.hpp" +#include "../vectorlogic/vec4.hpp" +#include "../noise/pnoise2.hpp" +#include +#include + +//template +class Node { + //static_assert(Dimension == 2 || Dimension == 3, "Dimensions must be 2 or 3"); +private: +public: + NodeType type; + Vec3f minBounds; + Vec3f maxBounds; + Vec4ui8 color; + enum NodeType { + BRANCH, + LEAF + }; + std::array children; + Node(const Vec3f min, const Vec3f max, NodeType t = LEAF) : type(t), minBounds(min), maxBounds(max), color(0,0,0,0) {} + + Vec3f getCenter() const { + return (minBounds + maxBounds) * 0.5f; + } + + int getChildIndex(const Vec3f& pos) const { + Vec3f c = getCenter(); + int index = 0; + if (pos.x >= c.x) index |= 1; + if (pos.y >= c.y) index |= 2; + if (pos.z >= c.z) index |= 4; + return index; + } + + std::pair getChildBounds(int childIndex) const { + Vec3f c = getCenter(); + Vec3f cMin = minBounds; + Vec3f cMax = maxBounds; + if (childIndex & 1) cMin.x = c.x; + else cMax.x = c.x; + + if (childIndex & 2) cMin.y = c.y; + else cMax.y = c.y; + + if (childIndex & 4) cMin.z = c.z; + else cMax.z = c.z; + + return {cMin, cMax}; + } + +}; + +class Grid { +private: + Node root; + Vec4ui8 DefaultBackgroundColor; + PNoise2 noisegen; + std::unordered_map Cache; +public: + Grid() : { + root = Node(Vec3f(0), Vec3f(0), Node::NodeType::BRANCH); + }; + + size_t insertVoxel(const Vec3f& pos, const Vec4ui8& color) { + if (!contains(pos)) { + return -1; + } + return insertRecusive(root.get(), pos, color, 0); + } + + void removeVoxel(const Vec3f& pos) { + bool removed = removeRecursive(root.get(), pos, 0); + if (removed) { + Cache.erase(pos); + } + } + + Vec4ui8 getVoxel(const Vec3f& pos) const { + Node* node = findNode(root.get(), pos, 0); + if (node && node->isLeaf()) { + return node->color; + } + return DefaultBackgroundColor; + } + + bool hasVoxel(const Vec3f& pos) 
const { + Node* node = findNode(root.get(), pos, 0); + return node && node->isLeaf(); + } + + bool rayIntersect(const Ray3& ray, Vec3f& hitPos, Vec4ui8& hitColor) const { + return rayintersectRecursive(root.get(), ray, hitPos, hitColor); + } + + std::unordered_map queryRegion(const Vec3f& min, const Vec3f& max) const { + std::unordered_map result; + queryRegionRecuse(root.get(), min, max, result); + return result; + } + + Grid& noiseGen(const Vec3f& min, const Vec3f& max, float minc = 0.1f, float maxc = 1.0f, bool genColor = true, int noiseMod = 42) { + TIME_FUNCTION; + noisegen = PNoise2(noiseMod); + std::vector poses; + std::vector colors; + + size_t estimatedSize = (max.x - min.x) * (max.y - min.y) * (max.z - min.z) * (maxc - minc); + poses.reserve(estimatedSize); + colors.reserve(estimatedSize); + + } +}; + +#endif