From a3509f6cdf1bba2bf4bbf782065b5b582ba10729 Mon Sep 17 00:00:00 2001 From: n8vm Date: Tue, 16 Feb 2021 22:56:45 -0700 Subject: [PATCH 01/55] removing unified memory for framebuffers, as this hurts multigpu performance in non-nvlink configurations --- src/nvisii/nvisii.cpp | 253 ++++++++++++++++-------------------------- 1 file changed, 94 insertions(+), 159 deletions(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 0e9079c5..0d635a38 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -263,62 +263,6 @@ int getDeviceCount() { return owlGetDeviceCount(OptixData.context); } -OWLModule moduleCreate(OWLContext context, const char* ptxCode) -{ - return owlModuleCreate(context, ptxCode); -} - -OWLBuffer managedMemoryBufferCreate(OWLContext context, OWLDataType type, size_t count, void* init) -{ - return owlManagedMemoryBufferCreate(context, type, count, init); -} - -OWLBuffer deviceBufferCreate(OWLContext context, OWLDataType type, size_t count, void* init) -{ - return owlDeviceBufferCreate(context, type, count, init); -} - -void bufferDestroy(OWLBuffer buffer) -{ - owlBufferDestroy(buffer); -} - -void bufferResize(OWLBuffer buffer, size_t newItemCount) { - owlBufferResize(buffer, newItemCount); -} - -const void* bufferGetPointer(OWLBuffer buffer, int deviceId) -{ - return owlBufferGetPointer(buffer, deviceId); -} - -void bufferUpload(OWLBuffer buffer, const void *hostPtr) -{ - owlBufferUpload(buffer, hostPtr); -} - -CUstream getStream(OWLContext context, int deviceId) -{ - return owlContextGetStream(context, deviceId); -} - -OptixDeviceContext getOptixContext(OWLContext context, int deviceID) -{ - return owlContextGetOptixContext(context, deviceID); -} - -void buildPrograms(OWLContext context) { - owlBuildPrograms(context); -} - -void buildPipeline(OWLContext context) { - owlBuildPipeline(context); -} - -void buildSBT(OWLContext context) { - owlBuildSBT(context); -} - OWLMissProg missProgCreate(OWLContext context, OWLModule module, 
const char *programName, size_t sizeOfVarStruct, OWLVarDecl *vars, size_t numVars) { return owlMissProgCreate(context, module, programName, sizeOfVarStruct, vars, numVars); @@ -488,12 +432,12 @@ void resizeOptixFrameBuffer(uint32_t width, uint32_t height) OD.LP.frameSize.x = width; OD.LP.frameSize.y = height; - bufferResize(OD.frameBuffer, width * height); - bufferResize(OD.normalBuffer, width * height); - bufferResize(OD.albedoBuffer, width * height); - bufferResize(OD.scratchBuffer, width * height); - bufferResize(OD.mvecBuffer, width * height); - bufferResize(OD.accumBuffer, width * height); + owlBufferResize(OD.frameBuffer, width * height); + owlBufferResize(OD.normalBuffer, width * height); + owlBufferResize(OD.albedoBuffer, width * height); + owlBufferResize(OD.scratchBuffer, width * height); + owlBufferResize(OD.mvecBuffer, width * height); + owlBufferResize(OD.accumBuffer, width * height); // Reconfigure denoiser optixDenoiserComputeMemoryResources(OD.denoiser, OD.LP.frameSize.x, OD.LP.frameSize.y, &OD.denoiserSizes); @@ -503,18 +447,18 @@ void resizeOptixFrameBuffer(uint32_t width, uint32_t height) #else scratchSizeInBytes = OD.denoiserSizes.withOverlapScratchSizeInBytes; #endif - bufferResize(OD.denoiserScratchBuffer, scratchSizeInBytes); - bufferResize(OD.denoiserStateBuffer, OD.denoiserSizes.stateSizeInBytes); + owlBufferResize(OD.denoiserScratchBuffer, scratchSizeInBytes); + owlBufferResize(OD.denoiserStateBuffer, OD.denoiserSizes.stateSizeInBytes); - auto cudaStream = getStream(OD.context, 0); + auto cudaStream = owlContextGetStream(OD.context, 0); optixDenoiserSetup ( OD.denoiser, (cudaStream_t) cudaStream, (unsigned int) OD.LP.frameSize.x, (unsigned int) OD.LP.frameSize.y, - (CUdeviceptr) bufferGetPointer(OD.denoiserStateBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.denoiserStateBuffer, 0), OD.denoiserSizes.stateSizeInBytes, - (CUdeviceptr) bufferGetPointer(OD.denoiserScratchBuffer, 0), + (CUdeviceptr) 
owlBufferGetPointer(OD.denoiserScratchBuffer, 0), scratchSizeInBytes ); @@ -551,7 +495,7 @@ void initializeOptix(bool headless) owlEnableMotionBlur(OD.context); owlContextSetRayTypeCount(OD.context, 2); cudaSetDevice(0); // OWL leaves the device as num_devices - 1 after the context is created. set it back to 0. - OD.module = moduleCreate(OD.context, ptxCode); + OD.module = owlModuleCreate(OD.context, ptxCode); /* Setup Optix Launch Params */ OWLVarDecl launchParamVars[] = { @@ -626,21 +570,12 @@ void initializeOptix(bool headless) initializeFrameBuffer(512, 512); } - if (numGPUsFound > 1) { - OD.frameBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.accumBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.normalBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.albedoBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.scratchBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.mvecBuffer = managedMemoryBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - } else { - OD.frameBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.accumBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.normalBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.albedoBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.scratchBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.mvecBuffer = deviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - } + OD.frameBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.accumBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.normalBuffer = 
owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.albedoBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.scratchBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.mvecBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); OD.LP.frameSize = glm::ivec2(512, 512); launchParamsSetBuffer(OD.launchParams, "frameBuffer", OD.frameBuffer); launchParamsSetBuffer(OD.launchParams, "normalBuffer", OD.normalBuffer); @@ -652,24 +587,24 @@ void initializeOptix(bool headless) /* Create Component Buffers */ // note, extra textures reserved for internal use - OD.entityBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(EntityStruct), Entity::getCount(), nullptr); - OD.transformBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(TransformStruct), Transform::getCount(), nullptr); - OD.cameraBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(CameraStruct), Camera::getCount(), nullptr); - OD.materialBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(MaterialStruct), Material::getCount(), nullptr); - OD.meshBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(MeshStruct), Mesh::getCount(), nullptr); - OD.lightBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(LightStruct), Light::getCount(), nullptr); - OD.textureBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(TextureStruct), Texture::getCount() + NUM_MAT_PARAMS * Material::getCount(), nullptr); - OD.volumeBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(VolumeStruct), Volume::getCount(), nullptr); - OD.volumeHandlesBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Volume::getCount(), nullptr); - OD.lightEntitiesBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); - OD.surfaceInstanceToEntityBuffer = deviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); - OD.volumeInstanceToEntityBuffer = deviceBufferCreate(OD.context, 
OWL_USER_TYPE(uint32_t), 1, nullptr); - OD.vertexListsBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); - OD.normalListsBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); - OD.tangentListsBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); - OD.texCoordListsBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); - OD.indexListsBuffer = deviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); - OD.textureObjectsBuffer = deviceBufferCreate(OD.context, OWL_TEXTURE, Texture::getCount() + NUM_MAT_PARAMS * Material::getCount(), nullptr); + OD.entityBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(EntityStruct), Entity::getCount(), nullptr); + OD.transformBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(TransformStruct), Transform::getCount(), nullptr); + OD.cameraBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(CameraStruct), Camera::getCount(), nullptr); + OD.materialBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(MaterialStruct), Material::getCount(), nullptr); + OD.meshBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(MeshStruct), Mesh::getCount(), nullptr); + OD.lightBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(LightStruct), Light::getCount(), nullptr); + OD.textureBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(TextureStruct), Texture::getCount() + NUM_MAT_PARAMS * Material::getCount(), nullptr); + OD.volumeBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(VolumeStruct), Volume::getCount(), nullptr); + OD.volumeHandlesBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Volume::getCount(), nullptr); + OD.lightEntitiesBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); + OD.surfaceInstanceToEntityBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); + OD.volumeInstanceToEntityBuffer = owlDeviceBufferCreate(OD.context, 
OWL_USER_TYPE(uint32_t), 1, nullptr); + OD.vertexListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); + OD.normalListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); + OD.tangentListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); + OD.texCoordListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); + OD.indexListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); + OD.textureObjectsBuffer = owlDeviceBufferCreate(OD.context, OWL_TEXTURE, Texture::getCount() + NUM_MAT_PARAMS * Material::getCount(), nullptr); launchParamsSetBuffer(OD.launchParams, "entities", OD.entityBuffer); launchParamsSetBuffer(OD.launchParams, "transforms", OD.transformBuffer); @@ -784,15 +719,15 @@ void initializeOptix(bool headless) OD.rayGen = rayGenCreate(OD.context,OD.module,"rayGen", sizeof(RayGenData), rayGenVars,-1); owlRayGenSet1i(OD.rayGen, "deviceCount", numGPUsFound); - buildPrograms(OD.context); + owlBuildPrograms(OD.context); /* Temporary GAS. Required for certain older driver versions. 
*/ const int NUM_VERTICES = 1; vec3 vertices[NUM_VERTICES] = {{ 0.f, 0.f, 0.f }}; const int NUM_INDICES = 1; ivec3 indices[NUM_INDICES] = {{ 0, 0, 0 }}; - OWLBuffer vertexBuffer = deviceBufferCreate(OD.context,OWL_FLOAT4,NUM_VERTICES,vertices); - OWLBuffer indexBuffer = deviceBufferCreate(OD.context,OWL_INT3,NUM_INDICES,indices); + OWLBuffer vertexBuffer = owlDeviceBufferCreate(OD.context,OWL_FLOAT4,NUM_VERTICES,vertices); + OWLBuffer indexBuffer = owlDeviceBufferCreate(OD.context,OWL_INT3,NUM_INDICES,indices); OWLGeom trianglesGeom = geomCreate(OD.context,OD.trianglesGeomType); trianglesSetVertices(trianglesGeom,vertexBuffer,NUM_VERTICES,sizeof(vec4),0); trianglesSetIndices(trianglesGeom,indexBuffer, NUM_INDICES,sizeof(ivec3),0); @@ -819,8 +754,8 @@ void initializeOptix(bool headless) launchParamsSetGroup(OD.launchParams, "volumesIAS", volumesIAS); // Build *SBT* required to trace the groups - buildPipeline(OD.context); - buildSBT(OD.context); + owlBuildPipeline(OD.context); + owlBuildSBT(OD.context); // Setup denoiser configureDenoiser(OD.enableAlbedoGuide, OD.enableNormalGuide, OD.enableKernelPrediction); @@ -1222,11 +1157,11 @@ void updateComponents() if (m->getTriangleIndices().size() == 0) throw std::runtime_error("ERROR: indices is 0"); // Next, allocate resources for the new mesh. 
- OD.vertexLists[m->getAddress()] = deviceBufferCreate(OD.context, OWL_USER_TYPE(vec3), m->getVertices().size(), m->getVertices().data()); - OD.normalLists[m->getAddress()] = deviceBufferCreate(OD.context, OWL_USER_TYPE(vec4), m->getNormals().size(), m->getNormals().data()); - OD.tangentLists[m->getAddress()] = deviceBufferCreate(OD.context, OWL_USER_TYPE(vec4), m->getTangents().size(), m->getTangents().data()); - OD.texCoordLists[m->getAddress()] = deviceBufferCreate(OD.context, OWL_USER_TYPE(vec2), m->getTexCoords().size(), m->getTexCoords().data()); - OD.indexLists[m->getAddress()] = deviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), m->getTriangleIndices().size(), m->getTriangleIndices().data()); + OD.vertexLists[m->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(vec3), m->getVertices().size(), m->getVertices().data()); + OD.normalLists[m->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(vec4), m->getNormals().size(), m->getNormals().data()); + OD.tangentLists[m->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(vec4), m->getTangents().size(), m->getTangents().data()); + OD.texCoordLists[m->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(vec2), m->getTexCoords().size(), m->getTexCoords().data()); + OD.indexLists[m->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), m->getTriangleIndices().size(), m->getTriangleIndices().data()); // Create geometry and build BLAS OD.surfaceGeomList[m->getAddress()] = geomCreate(OD.context, OD.trianglesGeomType); @@ -1236,13 +1171,13 @@ void updateComponents() groupBuildAccel(OD.surfaceBlasList[m->getAddress()]); } - bufferUpload(OD.vertexListsBuffer, OD.vertexLists.data()); - bufferUpload(OD.texCoordListsBuffer, OD.texCoordLists.data()); - bufferUpload(OD.indexListsBuffer, OD.indexLists.data()); - bufferUpload(OD.normalListsBuffer, OD.normalLists.data()); - bufferUpload(OD.tangentListsBuffer, OD.tangentLists.data()); + 
owlBufferUpload(OD.vertexListsBuffer, OD.vertexLists.data()); + owlBufferUpload(OD.texCoordListsBuffer, OD.texCoordLists.data()); + owlBufferUpload(OD.indexListsBuffer, OD.indexLists.data()); + owlBufferUpload(OD.normalListsBuffer, OD.normalLists.data()); + owlBufferUpload(OD.tangentListsBuffer, OD.tangentLists.data()); Mesh::updateComponents(); - bufferUpload(OptixData.meshBuffer, Mesh::getFrontStruct()); + owlBufferUpload(OptixData.meshBuffer, Mesh::getFrontStruct()); } // Manage Volumes: Build / Rebuild BLAS @@ -1394,8 +1329,8 @@ void updateComponents() } owlInstanceGroupSetTransforms(OD.surfacesIAS,0,(const float*)t0OwlSurfaceTransforms.data()); owlInstanceGroupSetTransforms(OD.surfacesIAS,1,(const float*)t1OwlSurfaceTransforms.data()); - bufferResize(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.size()); - bufferUpload(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.data()); + owlBufferResize(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.size()); + owlBufferUpload(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.data()); } // Set volume transforms to IAS, upload volume instance to entity map @@ -1408,8 +1343,8 @@ void updateComponents() } owlInstanceGroupSetTransforms(OD.volumesIAS,0,(const float*)t0OwlVolumeTransforms.data()); owlInstanceGroupSetTransforms(OD.volumesIAS,1,(const float*)t1OwlVolumeTransforms.data()); - bufferResize(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.size()); - bufferUpload(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.data()); + owlBufferResize(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.size()); + owlBufferUpload(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.data()); } // Build IAS @@ -1419,7 +1354,7 @@ void updateComponents() launchParamsSetGroup(OD.launchParams, "surfacesIAS", OD.surfacesIAS); // Now that IAS have changed, we need to rebuild SBT - buildSBT(OD.context); + owlBuildSBT(OD.context); // Release any old IAS (TODO, don't rebuild if 
entity edit doesn't effect IAS...) if (oldSurfaceIAS) {owlGroupRelease(oldSurfaceIAS);} @@ -1434,14 +1369,14 @@ void updateComponents() if (!entities[eid].getMesh()) continue; OD.lightEntities.push_back(eid); } - bufferResize(OptixData.lightEntitiesBuffer, OD.lightEntities.size()); - bufferUpload(OptixData.lightEntitiesBuffer, OD.lightEntities.data()); + owlBufferResize(OptixData.lightEntitiesBuffer, OD.lightEntities.size()); + owlBufferUpload(OptixData.lightEntitiesBuffer, OD.lightEntities.data()); OD.LP.numLightEntities = uint32_t(OD.lightEntities.size()); launchParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); // Finally, upload entity structs to the GPU. Entity::updateComponents(); - bufferUpload(OptixData.entityBuffer, Entity::getFrontStruct()); + owlBufferUpload(OptixData.entityBuffer, Entity::getFrontStruct()); } // Manage textures and materials @@ -1568,13 +1503,13 @@ void updateComponents() } Material::updateComponents(); - bufferUpload(OptixData.materialBuffer, OptixData.materialStructs.data()); + owlBufferUpload(OptixData.materialBuffer, OptixData.materialStructs.data()); } - bufferUpload(OD.textureObjectsBuffer, OD.textureObjects.data()); + owlBufferUpload(OD.textureObjectsBuffer, OD.textureObjects.data()); Texture::updateComponents(); memcpy(OptixData.textureStructs.data(), Texture::getFrontStruct(), Texture::getCount() * sizeof(TextureStruct)); - bufferUpload(OptixData.textureBuffer, OptixData.textureStructs.data()); + owlBufferUpload(OptixData.textureBuffer, OptixData.textureStructs.data()); } // Manage transforms @@ -1602,13 +1537,13 @@ void updateComponents() // Manage Cameras if (Camera::areAnyDirty()) { Camera::updateComponents(); - bufferUpload(OptixData.cameraBuffer, Camera::getFrontStruct()); + owlBufferUpload(OptixData.cameraBuffer, Camera::getFrontStruct()); } // Manage lights if (Light::areAnyDirty()) { Light::updateComponents(); - bufferUpload(OptixData.lightBuffer, Light::getFrontStruct()); + 
owlBufferUpload(OptixData.lightBuffer, Light::getFrontStruct()); } } @@ -1644,9 +1579,9 @@ void denoiseImage() { synchronizeDevices(); auto &OD = OptixData; - auto cudaStream = getStream(OD.context, 0); + auto cudaStream = owlContextGetStream(OD.context, 0); - CUdeviceptr frameBuffer = (CUdeviceptr) bufferGetPointer(OD.frameBuffer, 0); + CUdeviceptr frameBuffer = (CUdeviceptr) owlBufferGetPointer(OD.frameBuffer, 0); std::vector inputLayers; OptixImage2D colorLayer; @@ -1655,7 +1590,7 @@ void denoiseImage() { colorLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; colorLayer.pixelStrideInBytes = 4 * sizeof(float); colorLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - colorLayer.data = (CUdeviceptr) bufferGetPointer(OD.frameBuffer, 0); + colorLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.frameBuffer, 0); inputLayers.push_back(colorLayer); OptixImage2D albedoLayer; @@ -1664,7 +1599,7 @@ void denoiseImage() { albedoLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; albedoLayer.pixelStrideInBytes = 4 * sizeof(float); albedoLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - albedoLayer.data = (CUdeviceptr) bufferGetPointer(OD.albedoBuffer, 0); + albedoLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.albedoBuffer, 0); if (OD.enableAlbedoGuide) inputLayers.push_back(albedoLayer); OptixImage2D normalLayer; @@ -1673,7 +1608,7 @@ void denoiseImage() { normalLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; normalLayer.pixelStrideInBytes = 4 * sizeof(float); normalLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - normalLayer.data = (CUdeviceptr) bufferGetPointer(OD.normalBuffer, 0); + normalLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.normalBuffer, 0); if (OD.enableNormalGuide) inputLayers.push_back(normalLayer); OptixImage2D outputLayer = colorLayer; // can I get away with this? 
@@ -1692,8 +1627,8 @@ void denoiseImage() { OD.denoiser, cudaStream, &inputLayers[0], - (CUdeviceptr) bufferGetPointer(OD.hdrIntensityBuffer, 0), - (CUdeviceptr) bufferGetPointer(OD.denoiserScratchBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.hdrIntensityBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.denoiserScratchBuffer, 0), scratchSizeInBytes)); } @@ -1703,31 +1638,31 @@ void denoiseImage() { OD.denoiser, cudaStream, &inputLayers[0], - (CUdeviceptr) bufferGetPointer(OD.colorAvgBuffer, 0), - (CUdeviceptr) bufferGetPointer(OD.denoiserScratchBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.colorAvgBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.denoiserScratchBuffer, 0), scratchSizeInBytes)); } #endif params.denoiseAlpha = 0; // Don't touch alpha. params.blendFactor = 0.0f; // Show the denoised image only. - params.hdrIntensity = (CUdeviceptr) bufferGetPointer(OD.hdrIntensityBuffer, 0); + params.hdrIntensity = (CUdeviceptr) owlBufferGetPointer(OD.hdrIntensityBuffer, 0); #ifdef USE_OPTIX72 - params.hdrAverageColor = (CUdeviceptr) bufferGetPointer(OD.colorAvgBuffer, 0); + params.hdrAverageColor = (CUdeviceptr) owlBufferGetPointer(OD.colorAvgBuffer, 0); #endif OPTIX_CHECK(optixDenoiserInvoke( OD.denoiser, cudaStream, ¶ms, - (CUdeviceptr) bufferGetPointer(OD.denoiserStateBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.denoiserStateBuffer, 0), OD.denoiserSizes.stateSizeInBytes, inputLayers.data(), inputLayers.size(), /* inputOffsetX */ 0, /* inputOffsetY */ 0, &outputLayer, - (CUdeviceptr) bufferGetPointer(OD.denoiserScratchBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OD.denoiserScratchBuffer, 0), scratchSizeInBytes )); @@ -1741,7 +1676,7 @@ void drawFrameBufferToWindow() auto &OD = OptixData; cudaGraphicsMapResources(1, &OD.cudaResourceTex); - const void* fbdevptr = bufferGetPointer(OD.frameBuffer,0); + const void* fbdevptr = owlBufferGetPointer(OD.frameBuffer,0); cudaArray_t array; cudaGraphicsSubResourceGetMappedArray(&array, OD.cudaResourceTex, 
0, 0); cudaMemcpyToArray(array, 0, 0, fbdevptr, OD.LP.frameSize.x * OD.LP.frameSize.y * sizeof(glm::vec4), cudaMemcpyDeviceToDevice); @@ -1857,9 +1792,9 @@ void configureDenoiser(bool useAlbedoGuide, bool useNormalGuide, bool useKernelP if (!OptixData.colorAvgBuffer) OptixData.colorAvgBuffer = owlDeviceBufferCreate(OptixData.context, OWL_USER_TYPE(float), 4, nullptr); if (!OptixData.denoiserScratchBuffer) - OptixData.denoiserScratchBuffer = deviceBufferCreate(OptixData.context, OWL_USER_TYPE(void*), 1, nullptr); + OptixData.denoiserScratchBuffer = owlDeviceBufferCreate(OptixData.context, OWL_USER_TYPE(void*), 1, nullptr); if (!OptixData.denoiserStateBuffer) - OptixData.denoiserStateBuffer = deviceBufferCreate(OptixData.context, OWL_USER_TYPE(void*), 1, nullptr); + OptixData.denoiserStateBuffer = owlDeviceBufferCreate(OptixData.context, OWL_USER_TYPE(void*), 1, nullptr); // Setup denoiser OptixDenoiserOptions options; @@ -1873,8 +1808,8 @@ void configureDenoiser(bool useAlbedoGuide, bool useNormalGuide, bool useKernelP if (OptixData.denoiser) optixDenoiserDestroy(OptixData.denoiser); - auto optixContext = getOptixContext(OptixData.context, 0); - auto cudaStream = getStream(OptixData.context, 0); + auto optixContext = owlContextGetOptixContext(OptixData.context, 0); + auto cudaStream = owlContextGetStream(OptixData.context, 0); OPTIX_CHECK(optixDenoiserCreate(optixContext, &options, &OptixData.denoiser)); OptixDenoiserModelKind kind; @@ -1907,9 +1842,9 @@ void configureDenoiser(bool useAlbedoGuide, bool useNormalGuide, bool useKernelP (cudaStream_t) cudaStream, (unsigned int) OptixData.LP.frameSize.x, (unsigned int) OptixData.LP.frameSize.y, - (CUdeviceptr) bufferGetPointer(OptixData.denoiserStateBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OptixData.denoiserStateBuffer, 0), OptixData.denoiserSizes.stateSizeInBytes, - (CUdeviceptr) bufferGetPointer(OptixData.denoiserScratchBuffer, 0), + (CUdeviceptr) owlBufferGetPointer(OptixData.denoiserScratchBuffer, 0), 
scratchSizeInBytes ); }); @@ -1922,7 +1857,7 @@ std::vector readFrameBuffer() { int num_devices = getDeviceCount(); synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*)bufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*)owlBufferGetPointer(OptixData.frameBuffer,0); for (uint32_t test = 0; test < frameBuffer.size(); test += 4) { frameBuffer[test + 0] = fb[test / 4].r; frameBuffer[test + 1] = fb[test / 4].g; @@ -1995,7 +1930,7 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*) bufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); synchronizeDevices(); @@ -2127,7 +2062,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*) bufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); OptixData.LP.renderDataMode = 0; @@ -2418,12 +2353,12 @@ void initializeInteractive( denoiseImage(); } } - // glm::vec4* samplePtr = (glm::vec4*) bufferGetPointer(OptixData.accumBuffer,0); - // glm::vec4* mvecPtr = (glm::vec4*) bufferGetPointer(OptixData.mvecBuffer,0); - // glm::vec4* t0AlbPtr = (glm::vec4*) bufferGetPointer(OptixData.scratchBuffer,0); - // glm::vec4* t1AlbPtr = (glm::vec4*) bufferGetPointer(OptixData.albedoBuffer,0); - // glm::vec4* fbPtr = (glm::vec4*) bufferGetPointer(OptixData.frameBuffer,0); - // glm::vec4* sPtr = (glm::vec4*) bufferGetPointer(OptixData.normalBuffer,0); + // glm::vec4* samplePtr = (glm::vec4*) owlBufferGetPointer(OptixData.accumBuffer,0); + // glm::vec4* mvecPtr = (glm::vec4*) 
owlBufferGetPointer(OptixData.mvecBuffer,0); + // glm::vec4* t0AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.scratchBuffer,0); + // glm::vec4* t1AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.albedoBuffer,0); + // glm::vec4* fbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); + // glm::vec4* sPtr = (glm::vec4*) owlBufferGetPointer(OptixData.normalBuffer,0); // int width = OptixData.LP.frameSize.x; // int height = OptixData.LP.frameSize.y; // reproject(samplePtr, t0AlbPtr, t1AlbPtr, mvecPtr, sPtr, fbPtr, width, height); From 0c46d4d6e9336bbbd39c2e524cc003ac19cae626 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 17 Feb 2021 00:39:45 -0700 Subject: [PATCH 02/55] some fixes for multigpu setups. Some issues with different gpus rendering different images: --- src/nvisii/nvisii.cpp | 92 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 0d635a38..3f415090 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1575,6 +1575,80 @@ void updateLaunchParams() OptixData.LP.frameID ++; } +// Different GPUs have different local framebuffers. +// This function combines those framebuffers on the CPU, then uploads results to device 0. 
+void mergeFrameBuffers() { + int deviceCount = getDeviceCount(); + int width = OptixData.LP.frameSize.x; + int height = OptixData.LP.frameSize.y; + if (deviceCount <= 1) return; + + // synchronizeDevices(); + + std::vector fb_h(width * height); + std::vector fba_h(width * height); + std::vector fbn_h(width * height); + std::vector> fb_hd(deviceCount); + std::vector> fba_hd(deviceCount); + std::vector> fbn_hd(deviceCount); + for (uint32_t i = 0; i < deviceCount; ++i){ + fb_hd[i] = std::vector(width * height); + fba_hd[i] = std::vector(width * height); + fbn_hd[i] = std::vector(width * height); + void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,i); + void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,i); + void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,i); + cudaMemcpyAsync((void*)fb_hd[i].data(), (void*)fb_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); + cudaMemcpyAsync((void*)fba_hd[i].data(), (void*)fba_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); + cudaMemcpyAsync((void*)fbn_hd[i].data(), (void*)fbn_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); + } + // synchronizeDevices(); + + // note, GPUs render 32xN strips + for (uint32_t y = 0; y < height; ++y) { + for (uint32_t x = 0; x < width; x += 32) { + if (x >= width) continue; + int deviceThatIsResponsible = (x>>5) % deviceCount; + { + glm::vec4* A = fb_h.data() + (y * width) + x; + glm::vec4* B = fb_hd[deviceThatIsResponsible].data() + (y * width) + x; + memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); + } + { + glm::vec4* A = fba_h.data() + (y * width) + x; + glm::vec4* B = fba_hd[deviceThatIsResponsible].data() + (y * width) + x; + memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); + } + { + glm::vec4* A = fbn_h.data() + (y * width) + x; + glm::vec4* B = fbn_hd[deviceThatIsResponsible].data() + (y * width) + x; + memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); + } + } + } + + // // note, GPUs 
render 32xN strips + // for (uint32_t y = 0; y < height; ++y) { + // for (uint32_t x = 0; x < width; x += 32) { + // int deviceThatIsResponsible = (x>>5) % deviceCount; + // glm::vec4* A = fb_h.data() + (y * width) + x; + // glm::vec4* B = ((glm::vec4*)fb_d[deviceThatIsResponsible]) + (y * width) + x; + // cudaMemcpyAsync((void*)A, (void*)B, min(32, int(width - x)) * sizeof(glm::vec4), cudaMemcpyDeviceToHost); + // } + // } + + + // cudaMemcpyAsync(fb_h.data(), fb_d[1], fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); + synchronizeDevices(); + void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,0); + void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,0); + void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,0); + cudaMemcpyAsync(fb_d, fb_h.data(), fb_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); + cudaMemcpyAsync(fba_d, fba_h.data(), fba_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); + cudaMemcpyAsync(fbn_d, fbn_h.data(), fbn_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); + // synchronizeDevices(); +} + void denoiseImage() { synchronizeDevices(); @@ -1903,12 +1977,14 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi updateLaunchParams(); owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, OptixData.launchParams); - if (OptixData.enableDenoiser) - { - denoiseImage(); - } if (!NVISII.headlessMode) { + mergeFrameBuffers(); + if (OptixData.enableDenoiser) + { + denoiseImage(); + } + drawFrameBufferToWindow(); glfwSetWindowTitle(WindowData.window, (std::to_string(i) + std::string("/") + std::to_string(samplesPerPixel)).c_str()); @@ -1930,6 +2006,12 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi synchronizeDevices(); + mergeFrameBuffers(); + if (OptixData.enableDenoiser) + { + denoiseImage(); + } + const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); 
cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); @@ -2049,6 +2131,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra updateLaunchParams(); owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, OptixData.launchParams); + mergeFrameBuffers(); // Dont run denoiser to raw data rendering // if (OptixData.enableDenoiser) // { @@ -2348,6 +2431,7 @@ void initializeInteractive( updateComponents(); updateLaunchParams(); owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, OptixData.launchParams); + mergeFrameBuffers(); if (OptixData.enableDenoiser) { denoiseImage(); From 81291c7c1602107b9e0b3ea66bddb465ce2842f1 Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 20 Feb 2021 12:12:34 -0700 Subject: [PATCH 03/55] changing how max path bounces are handled to avoid long tails. --- src/nvisii/devicecode/path_tracer.cu | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index ed8344db..63b09de8 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -940,6 +940,8 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() uint8_t transparencyDepth = 0; uint8_t transmissionDepth = 0; uint8_t volumeDepth = 0; + int sampledBsdf = -1; + bool useBRDF = true; // direct here is used for final image clamping float3 directIllum = make_float3(0.f); @@ -1214,7 +1216,6 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float3 irradiance = make_float3(0.f); // If we hit a volume, use hybrid scattering to determine whether or not to use a BRDF or a phase function. 
- bool useBRDF = true; if (volPayload.tHit >= 0.f) { float opacity = mat.alpha; // would otherwise be sampled from a transfer function float grad_len = uv.y; @@ -1232,7 +1233,6 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // First, sample the BRDF / phase function so that we can use the sampled direction for MIS float3 w_i; float bsdfPDF; - int sampledBsdf = -1; float3 bsdf; if (useBRDF) { sample_disney_brdf( @@ -1554,11 +1554,14 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // of terminating, just so that we don't get black regions in our glass if (transmissionDepth >= LP.maxTransmissionDepth) continue; } while ( - diffuseDepth < LP.maxDiffuseDepth && - glossyDepth < LP.maxGlossyDepth && - // transmissionDepth < LP.maxTransmissionDepth && // see comment above - transparencyDepth < LP.maxTransparencyDepth && - volumeDepth < LP.maxVolumeDepth + // Terminate the path if the sampled BRDF's corresponding bounce depth exceeds the max bounce for that bounce type minus the overall path depth. + // This prevents long tails that can otherwise occur from mixing BRDF events + (!(sampledBsdf == DISNEY_DIFFUSE_BRDF && diffuseDepth > (LP.maxDiffuseDepth - (depth - 1)))) && + (!(sampledBsdf == DISNEY_GLOSSY_BRDF && glossyDepth > LP.maxGlossyDepth - (depth - 1)) ) && + (!(sampledBsdf == DISNEY_CLEARCOAT_BRDF && glossyDepth > LP.maxGlossyDepth - (depth - 1)) ) && + (!(useBRDF == false && volumeDepth > LP.maxVolumeDepth - (depth - 1))) && + (!(transparencyDepth > LP.maxTransparencyDepth - (depth - 1))) + // (!(sampledBsdf == DISNEY_TRANSMISSION_BRDF && transmissionDepth < LP.maxTransmissionDepth - (depth - 1)) ) && // see comment above ); // For segmentations, save heatmap metadata From f57bcc6b6210b6d9d6d9d097fc874f2530c36bb7 Mon Sep 17 00:00:00 2001 From: n8vm Date: Fri, 26 Feb 2021 14:18:48 -0700 Subject: [PATCH 04/55] working on multigpu perf --- CMakeLists.txt | 3 + include/nvisii/utilities/CMakeLists.txt | 1 + include/nvisii/utilities/work_distribution.h | 74 ++++ src/nvisii/devicecode/buffer.h | 2 
+- src/nvisii/devicecode/launch_params.h | 4 +- src/nvisii/devicecode/path_tracer.cu | 71 ++-- src/nvisii/nvisii.cpp | 403 ++++++++++--------- src/nvisii/nvisii.cu | 37 +- 8 files changed, 359 insertions(+), 236 deletions(-) create mode 100644 include/nvisii/utilities/work_distribution.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9055fc1e..6b86d441 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,6 +293,9 @@ find_program(BIN2C bin2c /usr/local/cuda/bin) # optix 7 +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/include/nvisii/utilities/sutil/) +include_directories(SYSTEM ${CMAKE_CURRENT_SOURCE_DIR}/include/nvisii/utilities/) + if ($ENV{OPTIX_VERSION}) set(OPTION_OPTIX_VERSION $ENV{OPTIX_VERSION}) else() diff --git a/include/nvisii/utilities/CMakeLists.txt b/include/nvisii/utilities/CMakeLists.txt index ee81cbbe..84d88ba4 100644 --- a/include/nvisii/utilities/CMakeLists.txt +++ b/include/nvisii/utilities/CMakeLists.txt @@ -14,4 +14,5 @@ set(Utilities_HDR ${CMAKE_CURRENT_SOURCE_DIR}/singleton.h ${CMAKE_CURRENT_SOURCE_DIR}/version.h ${CMAKE_CURRENT_SOURCE_DIR}/procedural_sky.h + ${CMAKE_CURRENT_SOURCE_DIR}/work_distribution.h PARENT_SCOPE) diff --git a/include/nvisii/utilities/work_distribution.h b/include/nvisii/utilities/work_distribution.h new file mode 100644 index 00000000..5f1c5a56 --- /dev/null +++ b/include/nvisii/utilities/work_distribution.h @@ -0,0 +1,74 @@ +#pragma once + +#if defined(__CUDACC__) || defined(__CUDABE__) +# define SWD_HOSTDEVICE __host__ __device__ +# define SWD_INLINE __forceinline__ +# define CONST_STATIC_INIT( ... ) +#else +# define SWD_HOSTDEVICE +# define SWD_INLINE inline +# define CONST_STATIC_INIT( ... 
) = __VA_ARGS__ +#endif + +#include + +#include + +class StaticWorkDistribution +{ +public: + SWD_INLINE SWD_HOSTDEVICE void setRasterSize( int width, int height ) + { + m_width = width; + m_height = height; + } + + + SWD_INLINE SWD_HOSTDEVICE void setNumGPUs( int32_t num_gpus ) + { + m_num_gpus = num_gpus; + } + + + SWD_INLINE SWD_HOSTDEVICE int32_t numSamples( ) + { + const int tile_strip_width = TILE_WIDTH*m_num_gpus; + const int tile_strip_height = TILE_HEIGHT; + const int num_tile_strip_cols = m_width /tile_strip_width + ( m_width %tile_strip_width == 0 ? 0 : 1 ); + const int num_tile_strip_rows = m_height/tile_strip_height + ( m_height%tile_strip_height == 0 ? 0 : 1 ); + return num_tile_strip_rows*num_tile_strip_cols*TILE_WIDTH*TILE_HEIGHT; + } + + + SWD_INLINE SWD_HOSTDEVICE int2 getSamplePixel( int32_t gpu_idx, int32_t sample_idx ) + { + const int tile_strip_width = TILE_WIDTH*m_num_gpus; + const int tile_strip_height = TILE_HEIGHT; + const int num_tile_strip_cols = m_width /tile_strip_width + ( m_width % tile_strip_width == 0 ? 
0 : 1 ); + + const int tile_strip_idx = sample_idx / (TILE_WIDTH*TILE_HEIGHT ); + const int tile_strip_y = tile_strip_idx / num_tile_strip_cols; + const int tile_strip_x = tile_strip_idx - tile_strip_y * num_tile_strip_cols; + const int tile_strip_x_start = tile_strip_x * tile_strip_width; + const int tile_strip_y_start = tile_strip_y * tile_strip_height; + + const int tile_pixel_idx = sample_idx - ( tile_strip_idx * TILE_WIDTH*TILE_HEIGHT ); + const int tile_pixel_y = tile_pixel_idx / TILE_WIDTH; + const int tile_pixel_x = tile_pixel_idx - tile_pixel_y * TILE_WIDTH; + + const int tile_offset_x = ( gpu_idx + tile_strip_y % m_num_gpus ) % m_num_gpus * TILE_WIDTH; + + const int pixel_y = tile_strip_y_start + tile_pixel_y; + const int pixel_x = tile_strip_x_start + tile_pixel_x + tile_offset_x ; + return make_int2( pixel_x, pixel_y ); + } + + +private: + int32_t m_num_gpus = 0; + int32_t m_width = 0; + int32_t m_height = 0; + + static const int32_t TILE_WIDTH = 8; + static const int32_t TILE_HEIGHT = 4; +}; diff --git a/src/nvisii/devicecode/buffer.h b/src/nvisii/devicecode/buffer.h index 5464326e..8354d66f 100644 --- a/src/nvisii/devicecode/buffer.h +++ b/src/nvisii/devicecode/buffer.h @@ -35,4 +35,4 @@ class Buffer : public owl::device::Buffer #define GET(RETURN, TYPE, BUFFER, ADDRESS) \ if (BUFFER.data == nullptr) {::printf("Device Side Error on Line %d: buffer was nullptr.\n", __LINE__); asm("trap;");} \ if (ADDRESS >= BUFFER.count) {::printf("Device Side Error on Line %d: out of bounds access (address: %d, size %d).\n", __LINE__, ADDRESS, uint32_t(BUFFER.count)); asm("trap;");} \ -RETURN = ((TYPE*)BUFFER.data)[ADDRESS];\ +RETURN = ((TYPE*)BUFFER.data)[ADDRESS]; diff --git a/src/nvisii/devicecode/launch_params.h b/src/nvisii/devicecode/launch_params.h index ea2c2d51..ea5edb0f 100644 --- a/src/nvisii/devicecode/launch_params.h +++ b/src/nvisii/devicecode/launch_params.h @@ -19,10 +19,12 @@ #include "./buffer.h" struct LaunchParams { + Buffer sampleIndexBuffer; + 
glm::ivec2 frameSize; uint64_t frameID = 0; glm::vec4 *frameBuffer; - glm::vec4 *albedoBuffer; + uchar4 *albedoBuffer; glm::vec4 *normalBuffer; glm::vec4 *scratchBuffer; glm::vec4 *mvecBuffer; diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index ed8344db..8d99d060 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -644,7 +644,7 @@ float sampleTime(float xi) { } inline __device__ -owl::Ray generateRay(const CameraStruct &camera, const TransformStruct &transform, ivec2 pixelID, ivec2 frameSize, LCGRand &rng, float time) +owl::Ray generateRay(const CameraStruct &camera, const TransformStruct &transform, int2 pixelID, float2 frameSize, LCGRand &rng, float time) { auto &LP = optixLaunchParams; /* Generate camera rays */ @@ -665,7 +665,7 @@ owl::Ray generateRay(const CameraStruct &camera, const TransformStruct &transfor - vec2(LP.xPixelSamplingInterval[0], LP.yPixelSamplingInterval[0]) ) * vec2(lcg_randomf(rng),lcg_randomf(rng)); - vec2 inUV = (vec2(pixelID.x, pixelID.y) + aa) / vec2(frameSize); + vec2 inUV = (vec2(pixelID.x, pixelID.y) + aa) / make_vec2(frameSize); vec3 right = normalize(glm::column(viewinv, 0)); vec3 up = normalize(glm::column(viewinv, 1)); vec3 origin = glm::column(viewinv, 3); @@ -885,24 +885,19 @@ bool debugging() { OPTIX_RAYGEN_PROGRAM(rayGen)() { const RayGenData &self = owl::getProgramData(); - cudaTextureObject_t envTex = getEnvironmentTexture(); - auto &LP = optixLaunchParams; auto launchIndex = optixGetLaunchIndex().x; auto launchDim = optixGetLaunchDimensions().x; - auto pixelID = ivec2(launchIndex % LP.frameSize.x, launchIndex / LP.frameSize.x); + + GET(const int2 pixelID, int2, LP.sampleIndexBuffer, launchIndex); + + // Work distribution might assign tiles that cross over image boundary + if( pixelID.x > LP.frameSize.x-1 || pixelID.y > LP.frameSize.y-1 ) return; + + cudaTextureObject_t envTex = getEnvironmentTexture(); bool debug = (pixelID.x == 
int(LP.frameSize.x / 2) && pixelID.y == int(LP.frameSize.y / 2)); - float tmax = 1e20f; //todo: customize depending on scene bounds //glm::distance(LP.sceneBBMin, LP.sceneBBMax); - /* compute who is repsonible for a given group of pixels */ - /* and if it's not us, just return. */ - /* (some other device will compute these pixels) */ - int deviceThatIsResponsible = (pixelID.x>>5) % self.deviceCount; - if (self.deviceIndex != deviceThatIsResponsible) { - return; - } - auto dims = ivec2(LP.frameSize.x, LP.frameSize.x); uint64_t start_clock = clock(); int numLights = LP.numLightEntities; @@ -924,7 +919,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // Trace an initial ray through the scene - surfRay = generateRay(camera, camera_transform, pixelID, LP.frameSize, rng, time); + surfRay = generateRay(camera, camera_transform, pixelID, make_float2(LP.frameSize), rng, time); surfRay.tmax = tmax; float3 accum_illum = make_float3(0.f); @@ -1593,12 +1588,12 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() auto fbOfs = pixelID.x+LP.frameSize.x * ((LP.frameSize.y - 1) - pixelID.y); float4* accumPtr = (float4*) LP.accumPtr; float4* fbPtr = (float4*) LP.frameBuffer; - float4* normalPtr = (float4*) LP.normalBuffer; - float4* albedoPtr = (float4*) LP.albedoBuffer; + // float4* normalPtr = (float4*) LP.normalBuffer; + // float4* albedoPtr = (float4*) LP.albedoBuffer; float4 prev_color = accumPtr[fbOfs]; - float4 prev_normal = normalPtr[fbOfs]; - float4 prev_albedo = albedoPtr[fbOfs]; + // float4 prev_normal = normalPtr[fbOfs]; + // float4 prev_albedo = albedoPtr[fbOfs]; float4 accum_color; if (LP.renderDataMode == RenderDataFlags::NONE) @@ -1612,27 +1607,27 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } - // compute screen space normal / albedo - vec4 oldAlbedo = make_vec4(prev_albedo); - vec4 oldNormal = make_vec4(prev_normal); - if (any(isnan(oldAlbedo))) oldAlbedo = vec4(0.f); - if (any(isnan(oldNormal))) oldNormal = vec4(0.f); - vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, 1.f); - vec4 
accumAlbedo = (newAlbedo + float(LP.frameID) * oldAlbedo) / float(LP.frameID + 1); - vec4 newNormal = vec4(make_vec3(primaryNormal), 1.f); - if (!all(equal(make_vec3(primaryNormal), vec3(0.f, 0.f, 0.f)))) { - glm::quat r0 = glm::quat_cast(LP.viewT0); - glm::quat r1 = glm::quat_cast(LP.viewT1); - glm::quat rot = (glm::all(glm::equal(r0, r1))) ? r0 : glm::slerp(r0, r1, time); - vec3 tmp = normalize(glm::mat3_cast(rot) * make_vec3(primaryNormal)); - tmp = normalize(vec3(LP.proj * vec4(tmp, 0.f))); - newNormal = vec4(tmp, 1.f); - } - vec4 accumNormal = (newNormal + float(LP.frameID) * oldNormal) / float(LP.frameID + 1); + // // compute screen space normal / albedo + // vec4 oldAlbedo = make_vec4(prev_albedo); + // vec4 oldNormal = make_vec4(prev_normal); + // if (any(isnan(oldAlbedo))) oldAlbedo = vec4(0.f); + // if (any(isnan(oldNormal))) oldNormal = vec4(0.f); + // vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, 1.f); + // vec4 accumAlbedo = (newAlbedo + float(LP.frameID) * oldAlbedo) / float(LP.frameID + 1); + // vec4 newNormal = vec4(make_vec3(primaryNormal), 1.f); + // if (!all(equal(make_vec3(primaryNormal), vec3(0.f, 0.f, 0.f)))) { + // glm::quat r0 = glm::quat_cast(LP.viewT0); + // glm::quat r1 = glm::quat_cast(LP.viewT1); + // glm::quat rot = (glm::all(glm::equal(r0, r1))) ? 
r0 : glm::slerp(r0, r1, time); + // vec3 tmp = normalize(glm::mat3_cast(rot) * make_vec3(primaryNormal)); + // tmp = normalize(vec3(LP.proj * vec4(tmp, 0.f))); + // newNormal = vec4(tmp, 1.f); + // } + // vec4 accumNormal = (newNormal + float(LP.frameID) * oldNormal) / float(LP.frameID + 1); // save data to frame buffers accumPtr[fbOfs] = accum_color; fbPtr[fbOfs] = accum_color; - albedoPtr[fbOfs] = make_float4(accumAlbedo); - normalPtr[fbOfs] = make_float4(accumNormal); + // albedoPtr[fbOfs] = make_float4(accumAlbedo); + // normalPtr[fbOfs] = make_float4(accumNormal); } diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 3f415090..487dead4 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -23,6 +23,7 @@ #define PBRLUT_IMPLEMENTATION #include #include +#include #include #include @@ -89,6 +90,8 @@ static struct OptixData { LaunchParams LP; GLuint imageTexID = -1; cudaGraphicsResource_t cudaResourceTex; + OWLBuffer sampleIndexBuffer; + OWLBuffer frameBuffer; OWLBuffer normalBuffer; OWLBuffer albedoBuffer; @@ -184,7 +187,7 @@ static struct NVISII { bool headlessMode; std::function callback; std::recursive_mutex callbackMutex; - + StaticWorkDistribution wd; } NVISII; void applyStyle() @@ -344,30 +347,6 @@ owl4x3f glmToOWL(glm::mat4 &xfm){ return oxfm; } -OWLLaunchParams launchParamsCreate(OWLContext context, size_t size, OWLVarDecl *vars, size_t numVars) -{ - return owlParamsCreate(context, size, vars, numVars); -} - -void launchParamsSetBuffer(OWLLaunchParams params, const char* varName, OWLBuffer buffer) -{ - owlParamsSetBuffer(params, varName, buffer); -} - -void launchParamsSetRaw(OWLLaunchParams params, const char* varName, const void* data) -{ - owlParamsSetRaw(params, varName, data); -} - -void launchParamsSetTexture(OWLLaunchParams params, const char* varName, OWLTexture texture) -{ - owlParamsSetTexture(params, varName, texture); -} - -void launchParamsSetGroup(OWLLaunchParams params, const char *varName, OWLGroup group) { - 
owlParamsSetGroup(params, varName, group); -} - void synchronizeDevices() { for (int i = 0; i < getDeviceCount(); i++) { @@ -426,9 +405,22 @@ void initializeFrameBuffer(int fbWidth, int fbHeight) { synchronizeDevices(); } +extern "C" void fillSamplesCUDA( + int32_t num_samples, + cudaStream_t stream, + int32_t gpu_idx, + int32_t num_gpus, + int32_t width, + int32_t height, + int2* samples ); + void resizeOptixFrameBuffer(uint32_t width, uint32_t height) { auto &OD = OptixData; + uint32_t numGPUs = owlGetDeviceCount(OD.context); + + NVISII.wd.setRasterSize( width, height ); + NVISII.wd.setNumGPUs( numGPUs ); OD.LP.frameSize.x = width; OD.LP.frameSize.y = height; @@ -439,6 +431,23 @@ void resizeOptixFrameBuffer(uint32_t width, uint32_t height) owlBufferResize(OD.mvecBuffer, width * height); owlBufferResize(OD.accumBuffer, width * height); + owlBufferResize(OD.sampleIndexBuffer, NVISII.wd.numSamples()); + + for (uint32_t i = 0; i < numGPUs; ++i) + { + cudaSetDevice( i ); + fillSamplesCUDA( + NVISII.wd.numSamples(), + owlContextGetStream(OD.context, i), + i, + numGPUs, + width, + height, + (int2*)owlBufferGetPointer(OD.sampleIndexBuffer, i) + ); + } + cudaSetDevice(0); + // Reconfigure denoiser optixDenoiserComputeMemoryResources(OD.denoiser, OD.LP.frameSize.x, OD.LP.frameSize.y, &OD.denoiserSizes); uint64_t scratchSizeInBytes; @@ -499,6 +508,7 @@ void initializeOptix(bool headless) /* Setup Optix Launch Params */ OWLVarDecl launchParamVars[] = { + { "sampleIndexBuffer", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, sampleIndexBuffer)}, { "frameSize", OWL_USER_TYPE(glm::ivec2), OWL_OFFSETOF(LaunchParams, frameSize)}, { "frameID", OWL_USER_TYPE(uint64_t), OWL_OFFSETOF(LaunchParams, frameID)}, { "frameBuffer", OWL_BUFPTR, OWL_OFFSETOF(LaunchParams, frameBuffer)}, @@ -563,27 +573,32 @@ void initializeOptix(bool headless) { "enableDomeSampling", OWL_USER_TYPE(bool), OWL_OFFSETOF(LaunchParams, enableDomeSampling)}, { /* sentinel to mark end of list */ } }; - OD.launchParams = 
launchParamsCreate(OD.context, sizeof(LaunchParams), launchParamVars, -1); + OD.launchParams = owlParamsCreate(OD.context, sizeof(LaunchParams), launchParamVars, -1); /* Create AOV Buffers */ if (!headless) { initializeFrameBuffer(512, 512); } - OD.frameBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + NVISII.wd.setRasterSize( 512, 512 ); + NVISII.wd.setNumGPUs( owlGetDeviceCount(OD.context) ); + OD.sampleIndexBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(int2), NVISII.wd.numSamples(), nullptr); + owlParamsSetBuffer(OD.launchParams, "sampleIndexBuffer", OD.sampleIndexBuffer); + + OD.frameBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); OD.accumBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.normalBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.albedoBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.scratchBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); - OD.mvecBuffer = owlDeviceBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512, nullptr); + OD.normalBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); + OD.albedoBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); + OD.scratchBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); + OD.mvecBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); OD.LP.frameSize = glm::ivec2(512, 512); - launchParamsSetBuffer(OD.launchParams, "frameBuffer", OD.frameBuffer); - launchParamsSetBuffer(OD.launchParams, "normalBuffer", OD.normalBuffer); - launchParamsSetBuffer(OD.launchParams, "albedoBuffer", OD.albedoBuffer); - launchParamsSetBuffer(OD.launchParams, "scratchBuffer", OD.scratchBuffer); - launchParamsSetBuffer(OD.launchParams, "mvecBuffer", OD.mvecBuffer); - 
launchParamsSetBuffer(OD.launchParams, "accumPtr", OD.accumBuffer); - launchParamsSetRaw(OD.launchParams, "frameSize", &OD.LP.frameSize); + owlParamsSetBuffer(OD.launchParams, "frameBuffer", OD.frameBuffer); + owlParamsSetBuffer(OD.launchParams, "normalBuffer", OD.normalBuffer); + owlParamsSetBuffer(OD.launchParams, "albedoBuffer", OD.albedoBuffer); + owlParamsSetBuffer(OD.launchParams, "scratchBuffer", OD.scratchBuffer); + owlParamsSetBuffer(OD.launchParams, "mvecBuffer", OD.mvecBuffer); + owlParamsSetBuffer(OD.launchParams, "accumPtr", OD.accumBuffer); + owlParamsSetRaw(OD.launchParams, "frameSize", &OD.LP.frameSize); /* Create Component Buffers */ // note, extra textures reserved for internal use @@ -606,24 +621,24 @@ void initializeOptix(bool headless) OD.indexListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); OD.textureObjectsBuffer = owlDeviceBufferCreate(OD.context, OWL_TEXTURE, Texture::getCount() + NUM_MAT_PARAMS * Material::getCount(), nullptr); - launchParamsSetBuffer(OD.launchParams, "entities", OD.entityBuffer); - launchParamsSetBuffer(OD.launchParams, "transforms", OD.transformBuffer); - launchParamsSetBuffer(OD.launchParams, "cameras", OD.cameraBuffer); - launchParamsSetBuffer(OD.launchParams, "materials", OD.materialBuffer); - launchParamsSetBuffer(OD.launchParams, "meshes", OD.meshBuffer); - launchParamsSetBuffer(OD.launchParams, "lights", OD.lightBuffer); - launchParamsSetBuffer(OD.launchParams, "textures", OD.textureBuffer); - launchParamsSetBuffer(OD.launchParams, "volumes", OD.volumeBuffer); - launchParamsSetBuffer(OD.launchParams, "lightEntities", OD.lightEntitiesBuffer); - launchParamsSetBuffer(OD.launchParams, "surfaceInstanceToEntity", OD.surfaceInstanceToEntityBuffer); - launchParamsSetBuffer(OD.launchParams, "volumeInstanceToEntity", OD.volumeInstanceToEntityBuffer); - launchParamsSetBuffer(OD.launchParams, "vertexLists", OD.vertexListsBuffer); - launchParamsSetBuffer(OD.launchParams, "normalLists", 
OD.normalListsBuffer); - launchParamsSetBuffer(OD.launchParams, "tangentLists", OD.tangentListsBuffer); - launchParamsSetBuffer(OD.launchParams, "texCoordLists", OD.texCoordListsBuffer); - launchParamsSetBuffer(OD.launchParams, "indexLists", OD.indexListsBuffer); - launchParamsSetBuffer(OD.launchParams, "textureObjects", OD.textureObjectsBuffer); - launchParamsSetBuffer(OD.launchParams, "volumeHandles", OD.volumeHandlesBuffer); + owlParamsSetBuffer(OD.launchParams, "entities", OD.entityBuffer); + owlParamsSetBuffer(OD.launchParams, "transforms", OD.transformBuffer); + owlParamsSetBuffer(OD.launchParams, "cameras", OD.cameraBuffer); + owlParamsSetBuffer(OD.launchParams, "materials", OD.materialBuffer); + owlParamsSetBuffer(OD.launchParams, "meshes", OD.meshBuffer); + owlParamsSetBuffer(OD.launchParams, "lights", OD.lightBuffer); + owlParamsSetBuffer(OD.launchParams, "textures", OD.textureBuffer); + owlParamsSetBuffer(OD.launchParams, "volumes", OD.volumeBuffer); + owlParamsSetBuffer(OD.launchParams, "lightEntities", OD.lightEntitiesBuffer); + owlParamsSetBuffer(OD.launchParams, "surfaceInstanceToEntity", OD.surfaceInstanceToEntityBuffer); + owlParamsSetBuffer(OD.launchParams, "volumeInstanceToEntity", OD.volumeInstanceToEntityBuffer); + owlParamsSetBuffer(OD.launchParams, "vertexLists", OD.vertexListsBuffer); + owlParamsSetBuffer(OD.launchParams, "normalLists", OD.normalListsBuffer); + owlParamsSetBuffer(OD.launchParams, "tangentLists", OD.tangentListsBuffer); + owlParamsSetBuffer(OD.launchParams, "texCoordLists", OD.texCoordListsBuffer); + owlParamsSetBuffer(OD.launchParams, "indexLists", OD.indexListsBuffer); + owlParamsSetBuffer(OD.launchParams, "textureObjects", OD.textureObjectsBuffer); + owlParamsSetBuffer(OD.launchParams, "volumeHandles", OD.volumeHandlesBuffer); uint32_t meshCount = Mesh::getCount(); OD.vertexLists.resize(meshCount); @@ -647,13 +662,13 @@ void initializeOptix(bool headless) OD.LP.environmentMapID = -1; OD.LP.environmentMapRotation = 
glm::quat(1,0,0,0); - launchParamsSetRaw(OD.launchParams, "environmentMapID", &OD.LP.environmentMapID); - launchParamsSetRaw(OD.launchParams, "environmentMapRotation", &OD.LP.environmentMapRotation); + owlParamsSetRaw(OD.launchParams, "environmentMapID", &OD.LP.environmentMapID); + owlParamsSetRaw(OD.launchParams, "environmentMapRotation", &OD.LP.environmentMapRotation); - launchParamsSetBuffer(OD.launchParams, "environmentMapRows", OD.environmentMapRowsBuffer); - launchParamsSetBuffer(OD.launchParams, "environmentMapCols", OD.environmentMapColsBuffer); - launchParamsSetRaw(OD.launchParams, "environmentMapWidth", &OD.LP.environmentMapWidth); - launchParamsSetRaw(OD.launchParams, "environmentMapHeight", &OD.LP.environmentMapHeight); + owlParamsSetBuffer(OD.launchParams, "environmentMapRows", OD.environmentMapRowsBuffer); + owlParamsSetBuffer(OD.launchParams, "environmentMapCols", OD.environmentMapColsBuffer); + owlParamsSetRaw(OD.launchParams, "environmentMapWidth", &OD.LP.environmentMapWidth); + owlParamsSetRaw(OD.launchParams, "environmentMapHeight", &OD.LP.environmentMapHeight); // OWLTexture GGX_E_AVG_LOOKUP = owlTexture2DCreate(OD.context, // OWL_TEXEL_FORMAT_R32F, @@ -673,22 +688,22 @@ void initializeOptix(bool headless) // launchParamsSetTexture(OD.launchParams, "GGX_E_LOOKUP", GGX_E_LOOKUP); OD.LP.numLightEntities = uint32_t(OD.lightEntities.size()); - launchParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); - launchParamsSetRaw(OD.launchParams, "domeLightIntensity", &OD.LP.domeLightIntensity); - launchParamsSetRaw(OD.launchParams, "domeLightExposure", &OD.LP.domeLightExposure); - launchParamsSetRaw(OD.launchParams, "domeLightColor", &OD.LP.domeLightColor); - launchParamsSetRaw(OD.launchParams, "directClamp", &OD.LP.directClamp); - launchParamsSetRaw(OD.launchParams, "indirectClamp", &OD.LP.indirectClamp); - launchParamsSetRaw(OD.launchParams, "maxDiffuseDepth", &OD.LP.maxDiffuseDepth); - launchParamsSetRaw(OD.launchParams, 
"maxGlossyDepth", &OD.LP.maxGlossyDepth); - launchParamsSetRaw(OD.launchParams, "maxTransparencyDepth", &OD.LP.maxTransparencyDepth); - launchParamsSetRaw(OD.launchParams, "maxTransmissionDepth", &OD.LP.maxTransmissionDepth); - launchParamsSetRaw(OD.launchParams, "maxVolumeDepth", &OD.LP.maxVolumeDepth); - launchParamsSetRaw(OD.launchParams, "numLightSamples", &OD.LP.numLightSamples); - launchParamsSetRaw(OD.launchParams, "seed", &OD.LP.seed); - launchParamsSetRaw(OD.launchParams, "xPixelSamplingInterval", &OD.LP.xPixelSamplingInterval); - launchParamsSetRaw(OD.launchParams, "yPixelSamplingInterval", &OD.LP.yPixelSamplingInterval); - launchParamsSetRaw(OD.launchParams, "timeSamplingInterval", &OD.LP.timeSamplingInterval); + owlParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); + owlParamsSetRaw(OD.launchParams, "domeLightIntensity", &OD.LP.domeLightIntensity); + owlParamsSetRaw(OD.launchParams, "domeLightExposure", &OD.LP.domeLightExposure); + owlParamsSetRaw(OD.launchParams, "domeLightColor", &OD.LP.domeLightColor); + owlParamsSetRaw(OD.launchParams, "directClamp", &OD.LP.directClamp); + owlParamsSetRaw(OD.launchParams, "indirectClamp", &OD.LP.indirectClamp); + owlParamsSetRaw(OD.launchParams, "maxDiffuseDepth", &OD.LP.maxDiffuseDepth); + owlParamsSetRaw(OD.launchParams, "maxGlossyDepth", &OD.LP.maxGlossyDepth); + owlParamsSetRaw(OD.launchParams, "maxTransparencyDepth", &OD.LP.maxTransparencyDepth); + owlParamsSetRaw(OD.launchParams, "maxTransmissionDepth", &OD.LP.maxTransmissionDepth); + owlParamsSetRaw(OD.launchParams, "maxVolumeDepth", &OD.LP.maxVolumeDepth); + owlParamsSetRaw(OD.launchParams, "numLightSamples", &OD.LP.numLightSamples); + owlParamsSetRaw(OD.launchParams, "seed", &OD.LP.seed); + owlParamsSetRaw(OD.launchParams, "xPixelSamplingInterval", &OD.LP.xPixelSamplingInterval); + owlParamsSetRaw(OD.launchParams, "yPixelSamplingInterval", &OD.LP.yPixelSamplingInterval); + owlParamsSetRaw(OD.launchParams, "timeSamplingInterval", 
&OD.LP.timeSamplingInterval); OWLVarDecl trianglesGeomVars[] = {{/* sentinel to mark end of list */}}; OD.trianglesGeomType = geomTypeCreate(OD.context, OWL_GEOM_TRIANGLES, sizeof(TrianglesGeomData), trianglesGeomVars,-1); @@ -738,7 +753,7 @@ void initializeOptix(bool headless) OWLGroup surfacesIAS = instanceGroupCreate(OD.context, 1); instanceGroupSetChild(surfacesIAS, 0, OD.placeholderGroup); groupBuildAccel(surfacesIAS); - launchParamsSetGroup(OD.launchParams, "surfacesIAS", surfacesIAS); + owlParamsSetGroup(OD.launchParams, "surfacesIAS", surfacesIAS); OWLGeom userGeom = owlGeomCreate(OD.context, OD.volumeGeomType); owlGeomSetPrimCount(userGeom, 1); @@ -751,7 +766,7 @@ void initializeOptix(bool headless) OWLGroup volumesIAS = instanceGroupCreate(OD.context, 1); instanceGroupSetChild(volumesIAS, 0, OD.placeholderUserGroup); groupBuildAccel(volumesIAS); - launchParamsSetGroup(OD.launchParams, "volumesIAS", volumesIAS); + owlParamsSetGroup(OD.launchParams, "volumesIAS", volumesIAS); // Build *SBT* required to trace the groups owlBuildPipeline(OD.context); @@ -1037,7 +1052,7 @@ void setIndirectLightingClamp(float clamp) { clamp = std::max(float(clamp), float(0.f)); OptixData.LP.indirectClamp = clamp; - launchParamsSetRaw(OptixData.launchParams, "indirectClamp", &OptixData.LP.indirectClamp); + owlParamsSetRaw(OptixData.launchParams, "indirectClamp", &OptixData.LP.indirectClamp); resetAccumulation(); } @@ -1045,7 +1060,7 @@ void setDirectLightingClamp(float clamp) { clamp = std::max(float(clamp), float(0.f)); OptixData.LP.directClamp = clamp; - launchParamsSetRaw(OptixData.launchParams, "directClamp", &OptixData.LP.directClamp); + owlParamsSetRaw(OptixData.launchParams, "directClamp", &OptixData.LP.directClamp); resetAccumulation(); } @@ -1062,11 +1077,11 @@ void setMaxBounceDepth( OptixData.LP.maxTransmissionDepth = transmissionDepth; OptixData.LP.maxVolumeDepth = volumeDepth; - launchParamsSetRaw(OptixData.launchParams, "maxDiffuseDepth", 
&OptixData.LP.maxDiffuseDepth); - launchParamsSetRaw(OptixData.launchParams, "maxGlossyDepth", &OptixData.LP.maxGlossyDepth); - launchParamsSetRaw(OptixData.launchParams, "maxTransparencyDepth", &OptixData.LP.maxTransparencyDepth); - launchParamsSetRaw(OptixData.launchParams, "maxTransmissionDepth", &OptixData.LP.maxTransmissionDepth); - launchParamsSetRaw(OptixData.launchParams, "maxVolumeDepth", &OptixData.LP.maxVolumeDepth); + owlParamsSetRaw(OptixData.launchParams, "maxDiffuseDepth", &OptixData.LP.maxDiffuseDepth); + owlParamsSetRaw(OptixData.launchParams, "maxGlossyDepth", &OptixData.LP.maxGlossyDepth); + owlParamsSetRaw(OptixData.launchParams, "maxTransparencyDepth", &OptixData.LP.maxTransparencyDepth); + owlParamsSetRaw(OptixData.launchParams, "maxTransmissionDepth", &OptixData.LP.maxTransmissionDepth); + owlParamsSetRaw(OptixData.launchParams, "maxVolumeDepth", &OptixData.LP.maxVolumeDepth); resetAccumulation(); } @@ -1081,7 +1096,7 @@ void setLightSampleCount(uint32_t count) std::string("Error: number of light samples must be between 1 and ") + std::to_string(MAX_LIGHT_SAMPLES)); OptixData.LP.numLightSamples = count; - launchParamsSetRaw(OptixData.launchParams, "numLightSamples", &OptixData.LP.numLightSamples); + owlParamsSetRaw(OptixData.launchParams, "numLightSamples", &OptixData.LP.numLightSamples); resetAccumulation(); } @@ -1089,15 +1104,15 @@ void samplePixelArea(vec2 xSampleInterval, vec2 ySampleInterval) { OptixData.LP.xPixelSamplingInterval = xSampleInterval; OptixData.LP.yPixelSamplingInterval = ySampleInterval; - launchParamsSetRaw(OptixData.launchParams, "xPixelSamplingInterval", &OptixData.LP.xPixelSamplingInterval); - launchParamsSetRaw(OptixData.launchParams, "yPixelSamplingInterval", &OptixData.LP.yPixelSamplingInterval); + owlParamsSetRaw(OptixData.launchParams, "xPixelSamplingInterval", &OptixData.LP.xPixelSamplingInterval); + owlParamsSetRaw(OptixData.launchParams, "yPixelSamplingInterval", &OptixData.LP.yPixelSamplingInterval); 
resetAccumulation(); } void sampleTimeInterval(vec2 sampleTimeInterval) { OptixData.LP.timeSamplingInterval = sampleTimeInterval; - launchParamsSetRaw(OptixData.launchParams, "timeSamplingInterval", &OptixData.LP.timeSamplingInterval); + owlParamsSetRaw(OptixData.launchParams, "timeSamplingInterval", &OptixData.LP.timeSamplingInterval); resetAccumulation(); } @@ -1349,9 +1364,9 @@ void updateComponents() // Build IAS groupBuildAccel(OD.volumesIAS); - launchParamsSetGroup(OD.launchParams, "volumesIAS", OD.volumesIAS); + owlParamsSetGroup(OD.launchParams, "volumesIAS", OD.volumesIAS); groupBuildAccel(OD.surfacesIAS); - launchParamsSetGroup(OD.launchParams, "surfacesIAS", OD.surfacesIAS); + owlParamsSetGroup(OD.launchParams, "surfacesIAS", OD.surfacesIAS); // Now that IAS have changed, we need to rebuild SBT owlBuildSBT(OD.context); @@ -1372,7 +1387,7 @@ void updateComponents() owlBufferResize(OptixData.lightEntitiesBuffer, OD.lightEntities.size()); owlBufferUpload(OptixData.lightEntitiesBuffer, OD.lightEntities.data()); OD.LP.numLightEntities = uint32_t(OD.lightEntities.size()); - launchParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); + owlParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); // Finally, upload entity structs to the GPU. 
Entity::updateComponents(); @@ -1549,105 +1564,107 @@ void updateComponents() void updateLaunchParams() { - launchParamsSetRaw(OptixData.launchParams, "frameID", &OptixData.LP.frameID); - launchParamsSetRaw(OptixData.launchParams, "frameSize", &OptixData.LP.frameSize); - launchParamsSetRaw(OptixData.launchParams, "cameraEntity", &OptixData.LP.cameraEntity); - launchParamsSetRaw(OptixData.launchParams, "domeLightIntensity", &OptixData.LP.domeLightIntensity); - launchParamsSetRaw(OptixData.launchParams, "domeLightExposure", &OptixData.LP.domeLightExposure); - launchParamsSetRaw(OptixData.launchParams, "domeLightColor", &OptixData.LP.domeLightColor); - launchParamsSetRaw(OptixData.launchParams, "renderDataMode", &OptixData.LP.renderDataMode); - launchParamsSetRaw(OptixData.launchParams, "renderDataBounce", &OptixData.LP.renderDataBounce); - launchParamsSetRaw(OptixData.launchParams, "enableDomeSampling", &OptixData.LP.enableDomeSampling); - launchParamsSetRaw(OptixData.launchParams, "seed", &OptixData.LP.seed); - launchParamsSetRaw(OptixData.launchParams, "proj", &OptixData.LP.proj); - launchParamsSetRaw(OptixData.launchParams, "viewT0", &OptixData.LP.viewT0); - launchParamsSetRaw(OptixData.launchParams, "viewT1", &OptixData.LP.viewT1); - - launchParamsSetRaw(OptixData.launchParams, "environmentMapID", &OptixData.LP.environmentMapID); - launchParamsSetRaw(OptixData.launchParams, "environmentMapRotation", &OptixData.LP.environmentMapRotation); - launchParamsSetBuffer(OptixData.launchParams, "environmentMapRows", OptixData.environmentMapRowsBuffer); - launchParamsSetBuffer(OptixData.launchParams, "environmentMapCols", OptixData.environmentMapColsBuffer); - launchParamsSetRaw(OptixData.launchParams, "environmentMapWidth", &OptixData.LP.environmentMapWidth); - launchParamsSetRaw(OptixData.launchParams, "environmentMapHeight", &OptixData.LP.environmentMapHeight); - launchParamsSetRaw(OptixData.launchParams, "sceneBBMin", &OptixData.LP.sceneBBMin); - 
launchParamsSetRaw(OptixData.launchParams, "sceneBBMax", &OptixData.LP.sceneBBMax); + owlParamsSetRaw(OptixData.launchParams, "frameID", &OptixData.LP.frameID); + owlParamsSetRaw(OptixData.launchParams, "frameSize", &OptixData.LP.frameSize); + owlParamsSetRaw(OptixData.launchParams, "cameraEntity", &OptixData.LP.cameraEntity); + owlParamsSetRaw(OptixData.launchParams, "domeLightIntensity", &OptixData.LP.domeLightIntensity); + owlParamsSetRaw(OptixData.launchParams, "domeLightExposure", &OptixData.LP.domeLightExposure); + owlParamsSetRaw(OptixData.launchParams, "domeLightColor", &OptixData.LP.domeLightColor); + owlParamsSetRaw(OptixData.launchParams, "renderDataMode", &OptixData.LP.renderDataMode); + owlParamsSetRaw(OptixData.launchParams, "renderDataBounce", &OptixData.LP.renderDataBounce); + owlParamsSetRaw(OptixData.launchParams, "enableDomeSampling", &OptixData.LP.enableDomeSampling); + owlParamsSetRaw(OptixData.launchParams, "seed", &OptixData.LP.seed); + owlParamsSetRaw(OptixData.launchParams, "proj", &OptixData.LP.proj); + owlParamsSetRaw(OptixData.launchParams, "viewT0", &OptixData.LP.viewT0); + owlParamsSetRaw(OptixData.launchParams, "viewT1", &OptixData.LP.viewT1); + + owlParamsSetRaw(OptixData.launchParams, "environmentMapID", &OptixData.LP.environmentMapID); + owlParamsSetRaw(OptixData.launchParams, "environmentMapRotation", &OptixData.LP.environmentMapRotation); + owlParamsSetBuffer(OptixData.launchParams, "environmentMapRows", OptixData.environmentMapRowsBuffer); + owlParamsSetBuffer(OptixData.launchParams, "environmentMapCols", OptixData.environmentMapColsBuffer); + owlParamsSetRaw(OptixData.launchParams, "environmentMapWidth", &OptixData.LP.environmentMapWidth); + owlParamsSetRaw(OptixData.launchParams, "environmentMapHeight", &OptixData.LP.environmentMapHeight); + owlParamsSetRaw(OptixData.launchParams, "sceneBBMin", &OptixData.LP.sceneBBMin); + owlParamsSetRaw(OptixData.launchParams, "sceneBBMax", &OptixData.LP.sceneBBMax); OptixData.LP.frameID ++; 
} -// Different GPUs have different local framebuffers. -// This function combines those framebuffers on the CPU, then uploads results to device 0. -void mergeFrameBuffers() { - int deviceCount = getDeviceCount(); - int width = OptixData.LP.frameSize.x; - int height = OptixData.LP.frameSize.y; - if (deviceCount <= 1) return; - - // synchronizeDevices(); - - std::vector fb_h(width * height); - std::vector fba_h(width * height); - std::vector fbn_h(width * height); - std::vector> fb_hd(deviceCount); - std::vector> fba_hd(deviceCount); - std::vector> fbn_hd(deviceCount); - for (uint32_t i = 0; i < deviceCount; ++i){ - fb_hd[i] = std::vector(width * height); - fba_hd[i] = std::vector(width * height); - fbn_hd[i] = std::vector(width * height); - void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,i); - void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,i); - void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,i); - cudaMemcpyAsync((void*)fb_hd[i].data(), (void*)fb_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - cudaMemcpyAsync((void*)fba_hd[i].data(), (void*)fba_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - cudaMemcpyAsync((void*)fbn_hd[i].data(), (void*)fbn_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - } - // synchronizeDevices(); - - // note, GPUs render 32xN strips - for (uint32_t y = 0; y < height; ++y) { - for (uint32_t x = 0; x < width; x += 32) { - if (x >= width) continue; - int deviceThatIsResponsible = (x>>5) % deviceCount; - { - glm::vec4* A = fb_h.data() + (y * width) + x; - glm::vec4* B = fb_hd[deviceThatIsResponsible].data() + (y * width) + x; - memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); - } - { - glm::vec4* A = fba_h.data() + (y * width) + x; - glm::vec4* B = fba_hd[deviceThatIsResponsible].data() + (y * width) + x; - memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); - } - { - glm::vec4* A = fbn_h.data() + (y * width) + x; - glm::vec4* B = 
fbn_hd[deviceThatIsResponsible].data() + (y * width) + x; - memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); - } - } - } - - // // note, GPUs render 32xN strips - // for (uint32_t y = 0; y < height; ++y) { - // for (uint32_t x = 0; x < width; x += 32) { - // int deviceThatIsResponsible = (x>>5) % deviceCount; - // glm::vec4* A = fb_h.data() + (y * width) + x; - // glm::vec4* B = ((glm::vec4*)fb_d[deviceThatIsResponsible]) + (y * width) + x; - // cudaMemcpyAsync((void*)A, (void*)B, min(32, int(width - x)) * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - // } - // } - +// Update: This is still prohibitively slow. Official OptiX samples use host pinned memory. +// Moving to that approach... +// // Different GPUs have different local framebuffers. +// // This function combines those framebuffers on the CPU, then uploads results to device 0. +// void mergeFrameBuffers() { +// int deviceCount = getDeviceCount(); +// int width = OptixData.LP.frameSize.x; +// int height = OptixData.LP.frameSize.y; +// if (deviceCount <= 1) return; + +// // synchronizeDevices(); + +// std::vector fb_h(width * height); +// std::vector fba_h(width * height); +// std::vector fbn_h(width * height); +// std::vector> fb_hd(deviceCount); +// std::vector> fba_hd(deviceCount); +// std::vector> fbn_hd(deviceCount); +// for (uint32_t i = 0; i < deviceCount; ++i){ +// fb_hd[i] = std::vector(width * height); +// fba_hd[i] = std::vector(width * height); +// fbn_hd[i] = std::vector(width * height); +// void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,i); +// void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,i); +// void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,i); +// cudaMemcpyAsync((void*)fb_hd[i].data(), (void*)fb_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); +// cudaMemcpyAsync((void*)fba_hd[i].data(), (void*)fba_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); +// cudaMemcpyAsync((void*)fbn_hd[i].data(), (void*)fbn_d, 
fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); +// } +// // synchronizeDevices(); + +// // note, GPUs render 32xN strips +// for (uint32_t y = 0; y < height; ++y) { +// for (uint32_t x = 0; x < width; x += 32) { +// if (x >= width) continue; +// int deviceThatIsResponsible = (x>>5) % deviceCount; +// { +// glm::vec4* A = fb_h.data() + (y * width) + x; +// glm::vec4* B = fb_hd[deviceThatIsResponsible].data() + (y * width) + x; +// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); +// } +// { +// glm::vec4* A = fba_h.data() + (y * width) + x; +// glm::vec4* B = fba_hd[deviceThatIsResponsible].data() + (y * width) + x; +// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); +// } +// { +// glm::vec4* A = fbn_h.data() + (y * width) + x; +// glm::vec4* B = fbn_hd[deviceThatIsResponsible].data() + (y * width) + x; +// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); +// } +// } +// } - // cudaMemcpyAsync(fb_h.data(), fb_d[1], fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - synchronizeDevices(); - void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,0); - void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,0); - void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,0); - cudaMemcpyAsync(fb_d, fb_h.data(), fb_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); - cudaMemcpyAsync(fba_d, fba_h.data(), fba_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); - cudaMemcpyAsync(fbn_d, fbn_h.data(), fbn_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); - // synchronizeDevices(); -} +// // // note, GPUs render 32xN strips +// // for (uint32_t y = 0; y < height; ++y) { +// // for (uint32_t x = 0; x < width; x += 32) { +// // int deviceThatIsResponsible = (x>>5) % deviceCount; +// // glm::vec4* A = fb_h.data() + (y * width) + x; +// // glm::vec4* B = ((glm::vec4*)fb_d[deviceThatIsResponsible]) + (y * width) + x; +// // cudaMemcpyAsync((void*)A, (void*)B, min(32, int(width - x)) * 
sizeof(glm::vec4), cudaMemcpyDeviceToHost); +// // } +// // } + + +// // cudaMemcpyAsync(fb_h.data(), fb_d[1], fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); +// synchronizeDevices(); +// void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,0); +// void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,0); +// void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,0); +// cudaMemcpyAsync(fb_d, fb_h.data(), fb_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); +// cudaMemcpyAsync(fba_d, fba_h.data(), fba_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); +// cudaMemcpyAsync(fbn_d, fbn_h.data(), fbn_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); +// // synchronizeDevices(); +// } void denoiseImage() { synchronizeDevices(); @@ -1976,10 +1993,9 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi } updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, OptixData.launchParams); + owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); if (!NVISII.headlessMode) { - mergeFrameBuffers(); if (OptixData.enableDenoiser) { denoiseImage(); @@ -2006,7 +2022,6 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi synchronizeDevices(); - mergeFrameBuffers(); if (OptixData.enableDenoiser) { denoiseImage(); @@ -2130,8 +2145,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra } updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, OptixData.launchParams); - mergeFrameBuffers(); + owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); // Dont run denoiser to raw data rendering // if (OptixData.enableDenoiser) // { @@ -2430,8 +2444,7 @@ void initializeInteractive( updateFrameBuffer(); updateComponents(); updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, OptixData.LP.frameSize.x * 
OptixData.LP.frameSize.y, 1, OptixData.launchParams); - mergeFrameBuffers(); + owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); if (OptixData.enableDenoiser) { denoiseImage(); diff --git a/src/nvisii/nvisii.cu b/src/nvisii/nvisii.cu index 6a94855c..0d81d5d2 100644 --- a/src/nvisii/nvisii.cu +++ b/src/nvisii/nvisii.cu @@ -53,4 +53,39 @@ void reproject(glm::vec4 *sampleBuffer, glm::vec4 *t0AlbedoBuffer, glm::vec4 *t1 dim3 gridSize = dim3 (bx, by); _reproject<<>>(sampleBuffer, t0AlbedoBuffer, t1AlbedoBuffer, mvecBuffer, scratchBuffer, imageBuffer, true, width, height); _reproject<<>>(sampleBuffer, t0AlbedoBuffer, t1AlbedoBuffer, mvecBuffer, scratchBuffer, imageBuffer, false, width, height); -} \ No newline at end of file +} + +#include "work_distribution.h" + +extern "C" __global__ void fillSamples( + int gpu_idx, + int num_gpus, + int width, + int height, + int2* sample_indices ) +{ + StaticWorkDistribution wd; + wd.setRasterSize( width, height ); + wd.setNumGPUs( num_gpus ); + + const int sample_idx = blockIdx.x; + sample_indices[sample_idx] = wd.getSamplePixel( gpu_idx, sample_idx ); +} + + +extern "C" __host__ void fillSamplesCUDA( + int num_samples, + cudaStream_t stream, + int gpu_idx, + int num_gpus, + int width, + int height, + int2* sample_indices ) +{ + fillSamples<<>>( + gpu_idx, + num_gpus, + width, + height, + sample_indices ); +} From e0555a33a7dfab5c2fe0abe00f384ef31f0a5294 Mon Sep 17 00:00:00 2001 From: n8vm Date: Fri, 26 Feb 2021 17:34:51 -0700 Subject: [PATCH 05/55] some steps forward towards dynamic load balancing --- externals/owl | 2 +- include/nvisii/utilities/work_distribution.h | 2 +- src/nvisii/nvisii.cpp | 25 ++++++++++++++------ 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/externals/owl b/externals/owl index da110c5e..97d3acd6 160000 --- a/externals/owl +++ b/externals/owl @@ -1 +1 @@ -Subproject commit da110c5e1453c17c8e3567d187a69b9bc6082943 +Subproject commit 
97d3acd6ad4d0e356f90959d34c8a94024a4fa70 diff --git a/include/nvisii/utilities/work_distribution.h b/include/nvisii/utilities/work_distribution.h index 5f1c5a56..b02e93b8 100644 --- a/include/nvisii/utilities/work_distribution.h +++ b/include/nvisii/utilities/work_distribution.h @@ -30,7 +30,7 @@ class StaticWorkDistribution } - SWD_INLINE SWD_HOSTDEVICE int32_t numSamples( ) + SWD_INLINE SWD_HOSTDEVICE int32_t numSamples( int32_t gpu_idx ) { const int tile_strip_width = TILE_WIDTH*m_num_gpus; const int tile_strip_height = TILE_HEIGHT; diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 487dead4..c3d0923b 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -430,14 +430,13 @@ void resizeOptixFrameBuffer(uint32_t width, uint32_t height) owlBufferResize(OD.scratchBuffer, width * height); owlBufferResize(OD.mvecBuffer, width * height); owlBufferResize(OD.accumBuffer, width * height); - - owlBufferResize(OD.sampleIndexBuffer, NVISII.wd.numSamples()); + owlBufferResize(OD.sampleIndexBuffer, width * height); for (uint32_t i = 0; i < numGPUs; ++i) { cudaSetDevice( i ); fillSamplesCUDA( - NVISII.wd.numSamples(), + NVISII.wd.numSamples(i), owlContextGetStream(OD.context, i), i, numGPUs, @@ -582,7 +581,7 @@ void initializeOptix(bool headless) NVISII.wd.setRasterSize( 512, 512 ); NVISII.wd.setNumGPUs( owlGetDeviceCount(OD.context) ); - OD.sampleIndexBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(int2), NVISII.wd.numSamples(), nullptr); + OD.sampleIndexBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(int2), 512*512, nullptr); owlParamsSetBuffer(OD.launchParams, "sampleIndexBuffer", OD.sampleIndexBuffer); OD.frameBuffer = owlHostPinnedBufferCreate(OD.context,OWL_USER_TYPE(glm::vec4),512*512); @@ -1993,7 +1992,10 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi } updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); + for (uint32_t deviceID = 0; deviceID < 
owlGetDeviceCount(OptixData.context); deviceID++) { + owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + } + owlLaunchSync(OptixData.launchParams); if (!NVISII.headlessMode) { if (OptixData.enableDenoiser) @@ -2145,7 +2147,12 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra } updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); + + for (uint32_t deviceID = 0; deviceID < owlGetDeviceCount(OptixData.context); deviceID++) { + owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + } + owlLaunchSync(OptixData.launchParams); + // Dont run denoiser to raw data rendering // if (OptixData.enableDenoiser) // { @@ -2444,7 +2451,11 @@ void initializeInteractive( updateFrameBuffer(); updateComponents(); updateLaunchParams(); - owlLaunch2D(OptixData.rayGen, NVISII.wd.numSamples(), 1, OptixData.launchParams); + for (uint32_t deviceID = 0; deviceID < owlGetDeviceCount(OptixData.context); deviceID++) { + owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + } + owlLaunchSync(OptixData.launchParams); + if (OptixData.enableDenoiser) { denoiseImage(); From 14cfdee1b59202180eadc681cee6ca0b8ffd3838 Mon Sep 17 00:00:00 2001 From: n8vm Date: Sun, 28 Feb 2021 21:17:54 -0700 Subject: [PATCH 06/55] fixes to load balancing. 
Fixes to frame buffer when cuda visible devices set to non-zero value --- include/nvisii/nvisii.h | 36 +- include/nvisii/utilities/CMakeLists.txt | 1 - include/nvisii/utilities/work_distribution.h | 74 ---- src/nvisii/devicecode/launch_params.h | 5 +- src/nvisii/devicecode/path_tracer.cu | 79 ++-- src/nvisii/nvisii.cpp | 410 ++++++++++++------- src/nvisii/nvisii.cu | 35 -- 7 files changed, 341 insertions(+), 299 deletions(-) delete mode 100644 include/nvisii/utilities/work_distribution.h diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index e7126249..d15f6220 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -300,10 +300,20 @@ void renderToFile(uint32_t width, uint32_t height, uint32_t samples_per_pixel, s * @param bounce The number of bounces required to reach the vertex whose metadata result should come from. A value of 0 * would save data for objects directly visible to the camera, a value of 1 would save reflections/refractions, etc. * @param options Indicates the data to return. Current possible values include - * "none" for rendering out raw path traced data, "depth" to render the distance between the previous path vertex to the current one, - * "position" for rendering out the world space position of the path vertex, "normal" for rendering out the world space normal of the - * path vertex, "entity_id" for rendering out the entity ID whose surface the path vertex hit, "denoise_normal" for rendering out - * the normal buffer supplied to the Optix denoiser, and "denoise_albedo" for rendering out the albedo supplied to the Optix denoiser. 
+ * "none" for rendering out raw path traced data, + * "depth" to render the distance between the previous path vertex to the current one, + * "ray_direction" to render the direction that the ray was traced in world space, + * "position" for rendering out the world space position of the path vertex, + * "normal" for rendering out the world space normal of the path vertex, + * "entity_id" for rendering out the entity ID whose surface the path vertex hit, + * "base_color" for rendering out the surface base color, + * "texture_coordinates" for rendering out the texture coordinates of the hit surface, + * "screen_space_normal" for rendering out the normals of the hit surface in screen space, + * "diffuse_motion_vectors" for rendering out screen space motion vectors for moving objects, + * "denoise_normal" for rendering out the normal buffer supplied to the Optix denoiser, + * "denoise_albedo" for rendering out the albedo supplied to the Optix denoiser, + * "heatmap" for rendering out the time it takes to render out each pixel, + * "device_id" for determining which GPU was used to render what pixel. * @param seed A seed used to initialize the random number generator. */ std::vector renderData( @@ -319,10 +329,20 @@ std::vector renderData( * @param bounce The number of bounces required to reach the vertex whose metadata result should come from. A value of 0 * would save data for objects directly visible to the camera, a value of 1 would save reflections/refractions, etc. * @param options Indicates the data to return. 
Current possible values include - * "none" for rendering out raw path traced data, "depth" to render the distance between the previous path vertex to the current one, - * "position" for rendering out the world space position of the path vertex, "normal" for rendering out the world space normal of the - * path vertex, "entity_id" for rendering out the entity ID whose surface the path vertex hit, "denoise_normal" for rendering out - * the normal buffer supplied to the Optix denoiser, and "denoise_albedo" for rendering out the albedo supplied to the Optix denoiser. + * "none" for rendering out raw path traced data, + * "depth" to render the distance between the previous path vertex to the current one, + * "ray_direction" to render the direction that the ray was traced in world space, + * "position" for rendering out the world space position of the path vertex, + * "normal" for rendering out the world space normal of the path vertex, + * "entity_id" for rendering out the entity ID whose surface the path vertex hit, + * "base_color" for rendering out the surface base color, + * "texture_coordinates" for rendering out the texture coordinates of the hit surface, + * "screen_space_normal" for rendering out the normals of the hit surface in screen space, + * "diffuse_motion_vectors" for rendering out screen space motion vectors for moving objects, + * "denoise_normal" for rendering out the normal buffer supplied to the Optix denoiser, + * "denoise_albedo" for rendering out the albedo supplied to the Optix denoiser, + * "heatmap" for rendering out the time it takes to render out each pixel, + * "device_id" for determining which GPU was used to render what pixel. * @param file_path The path to use to save the file, including the extension. Supported extensions are EXR, HDR, and PNG * @param seed A seed used to initialize the random number generator. 
*/ diff --git a/include/nvisii/utilities/CMakeLists.txt b/include/nvisii/utilities/CMakeLists.txt index 84d88ba4..ee81cbbe 100644 --- a/include/nvisii/utilities/CMakeLists.txt +++ b/include/nvisii/utilities/CMakeLists.txt @@ -14,5 +14,4 @@ set(Utilities_HDR ${CMAKE_CURRENT_SOURCE_DIR}/singleton.h ${CMAKE_CURRENT_SOURCE_DIR}/version.h ${CMAKE_CURRENT_SOURCE_DIR}/procedural_sky.h - ${CMAKE_CURRENT_SOURCE_DIR}/work_distribution.h PARENT_SCOPE) diff --git a/include/nvisii/utilities/work_distribution.h b/include/nvisii/utilities/work_distribution.h deleted file mode 100644 index b02e93b8..00000000 --- a/include/nvisii/utilities/work_distribution.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#if defined(__CUDACC__) || defined(__CUDABE__) -# define SWD_HOSTDEVICE __host__ __device__ -# define SWD_INLINE __forceinline__ -# define CONST_STATIC_INIT( ... ) -#else -# define SWD_HOSTDEVICE -# define SWD_INLINE inline -# define CONST_STATIC_INIT( ... ) = __VA_ARGS__ -#endif - -#include - -#include - -class StaticWorkDistribution -{ -public: - SWD_INLINE SWD_HOSTDEVICE void setRasterSize( int width, int height ) - { - m_width = width; - m_height = height; - } - - - SWD_INLINE SWD_HOSTDEVICE void setNumGPUs( int32_t num_gpus ) - { - m_num_gpus = num_gpus; - } - - - SWD_INLINE SWD_HOSTDEVICE int32_t numSamples( int32_t gpu_idx ) - { - const int tile_strip_width = TILE_WIDTH*m_num_gpus; - const int tile_strip_height = TILE_HEIGHT; - const int num_tile_strip_cols = m_width /tile_strip_width + ( m_width %tile_strip_width == 0 ? 0 : 1 ); - const int num_tile_strip_rows = m_height/tile_strip_height + ( m_height%tile_strip_height == 0 ? 
0 : 1 ); - return num_tile_strip_rows*num_tile_strip_cols*TILE_WIDTH*TILE_HEIGHT; - } - - - SWD_INLINE SWD_HOSTDEVICE int2 getSamplePixel( int32_t gpu_idx, int32_t sample_idx ) - { - const int tile_strip_width = TILE_WIDTH*m_num_gpus; - const int tile_strip_height = TILE_HEIGHT; - const int num_tile_strip_cols = m_width /tile_strip_width + ( m_width % tile_strip_width == 0 ? 0 : 1 ); - - const int tile_strip_idx = sample_idx / (TILE_WIDTH*TILE_HEIGHT ); - const int tile_strip_y = tile_strip_idx / num_tile_strip_cols; - const int tile_strip_x = tile_strip_idx - tile_strip_y * num_tile_strip_cols; - const int tile_strip_x_start = tile_strip_x * tile_strip_width; - const int tile_strip_y_start = tile_strip_y * tile_strip_height; - - const int tile_pixel_idx = sample_idx - ( tile_strip_idx * TILE_WIDTH*TILE_HEIGHT ); - const int tile_pixel_y = tile_pixel_idx / TILE_WIDTH; - const int tile_pixel_x = tile_pixel_idx - tile_pixel_y * TILE_WIDTH; - - const int tile_offset_x = ( gpu_idx + tile_strip_y % m_num_gpus ) % m_num_gpus * TILE_WIDTH; - - const int pixel_y = tile_strip_y_start + tile_pixel_y; - const int pixel_x = tile_strip_x_start + tile_pixel_x + tile_offset_x ; - return make_int2( pixel_x, pixel_y ); - } - - -private: - int32_t m_num_gpus = 0; - int32_t m_width = 0; - int32_t m_height = 0; - - static const int32_t TILE_WIDTH = 8; - static const int32_t TILE_HEIGHT = 4; -}; diff --git a/src/nvisii/devicecode/launch_params.h b/src/nvisii/devicecode/launch_params.h index ea5edb0f..717b2cf3 100644 --- a/src/nvisii/devicecode/launch_params.h +++ b/src/nvisii/devicecode/launch_params.h @@ -19,7 +19,7 @@ #include "./buffer.h" struct LaunchParams { - Buffer sampleIndexBuffer; + Buffer assignmentBuffer; glm::ivec2 frameSize; uint64_t frameID = 0; @@ -113,7 +113,8 @@ enum RenderDataFlags : uint32_t { TRANSMISSION_INDIRECT_LIGHTING = 17, RAY_DIRECTION = 18, HEATMAP = 19, - TEXTURE_COORDINATES = 20 + TEXTURE_COORDINATES = 20, + DEVICE_ID = 21 }; #define MAX_LIGHT_SAMPLES 10 
diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index 8d99d060..905db055 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -872,6 +872,18 @@ void saveHeatmapRenderData( renderData = make_float3(relClock); } +__device__ +void saveDeviceAssignment( + float3 &renderData, + int bounce, + uint32_t deviceIndex +) +{ + auto &LP = optixLaunchParams; + if (LP.renderDataMode != RenderDataFlags::DEVICE_ID) return; + renderData = make_float3(deviceIndex); +} + __device__ bool debugging() { #ifndef DEBUGGING @@ -888,11 +900,21 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() auto &LP = optixLaunchParams; auto launchIndex = optixGetLaunchIndex().x; auto launchDim = optixGetLaunchDimensions().x; - - GET(const int2 pixelID, int2, LP.sampleIndexBuffer, launchIndex); - - // Work distribution might assign tiles that cross over image boundary + auto pixelID = make_int2(launchIndex % LP.frameSize.x, launchIndex / LP.frameSize.x); + + // Terminate thread if current pixel not assigned to this device + GET(float start, float, LP.assignmentBuffer, self.deviceIndex); + GET(float stop, float, LP.assignmentBuffer, self.deviceIndex + 1); + start *= (LP.frameSize.x * LP.frameSize.y); + stop *= (LP.frameSize.x * LP.frameSize.y); + + // if (launchIndex == 0) { + // printf("device %d start %f stop %f\n", self.deviceIndex, start, stop); + // } + if( pixelID.x > LP.frameSize.x-1 || pixelID.y > LP.frameSize.y-1 ) return; + if( (launchIndex < start) || (stop <= launchIndex) ) return; + // if (self.deviceIndex == 1) return; cudaTextureObject_t envTex = getEnvironmentTexture(); bool debug = (pixelID.x == int(LP.frameSize.x / 2) && pixelID.y == int(LP.frameSize.y / 2)); @@ -1559,6 +1581,9 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // For segmentations, save heatmap metadata saveHeatmapRenderData(renderData, depth, start_clock); + // Device assignment data + saveDeviceAssignment(renderData, depth, self.deviceIndex); + // clamp out any extreme 
fireflies glm::vec3 gillum = vec3(illum.x, illum.y, illum.z); glm::vec3 dillum = vec3(directIllum.x, directIllum.y, directIllum.z); @@ -1588,12 +1613,12 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() auto fbOfs = pixelID.x+LP.frameSize.x * ((LP.frameSize.y - 1) - pixelID.y); float4* accumPtr = (float4*) LP.accumPtr; float4* fbPtr = (float4*) LP.frameBuffer; - // float4* normalPtr = (float4*) LP.normalBuffer; - // float4* albedoPtr = (float4*) LP.albedoBuffer; + float4* normalPtr = (float4*) LP.normalBuffer; + float4* albedoPtr = (float4*) LP.albedoBuffer; float4 prev_color = accumPtr[fbOfs]; - // float4 prev_normal = normalPtr[fbOfs]; - // float4 prev_albedo = albedoPtr[fbOfs]; + float4 prev_normal = normalPtr[fbOfs]; + float4 prev_albedo = albedoPtr[fbOfs]; float4 accum_color; if (LP.renderDataMode == RenderDataFlags::NONE) @@ -1607,27 +1632,27 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } - // // compute screen space normal / albedo - // vec4 oldAlbedo = make_vec4(prev_albedo); - // vec4 oldNormal = make_vec4(prev_normal); - // if (any(isnan(oldAlbedo))) oldAlbedo = vec4(0.f); - // if (any(isnan(oldNormal))) oldNormal = vec4(0.f); - // vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, 1.f); - // vec4 accumAlbedo = (newAlbedo + float(LP.frameID) * oldAlbedo) / float(LP.frameID + 1); - // vec4 newNormal = vec4(make_vec3(primaryNormal), 1.f); - // if (!all(equal(make_vec3(primaryNormal), vec3(0.f, 0.f, 0.f)))) { - // glm::quat r0 = glm::quat_cast(LP.viewT0); - // glm::quat r1 = glm::quat_cast(LP.viewT1); - // glm::quat rot = (glm::all(glm::equal(r0, r1))) ? 
r0 : glm::slerp(r0, r1, time); - // vec3 tmp = normalize(glm::mat3_cast(rot) * make_vec3(primaryNormal)); - // tmp = normalize(vec3(LP.proj * vec4(tmp, 0.f))); - // newNormal = vec4(tmp, 1.f); - // } - // vec4 accumNormal = (newNormal + float(LP.frameID) * oldNormal) / float(LP.frameID + 1); + // compute screen space normal / albedo + vec4 oldAlbedo = make_vec4(prev_albedo); + vec4 oldNormal = make_vec4(prev_normal); + if (any(isnan(oldAlbedo))) oldAlbedo = vec4(0.f); + if (any(isnan(oldNormal))) oldNormal = vec4(0.f); + vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, 1.f); + vec4 accumAlbedo = (newAlbedo + float(LP.frameID) * oldAlbedo) / float(LP.frameID + 1); + vec4 newNormal = vec4(make_vec3(primaryNormal), 1.f); + if (!all(equal(make_vec3(primaryNormal), vec3(0.f, 0.f, 0.f)))) { + glm::quat r0 = glm::quat_cast(LP.viewT0); + glm::quat r1 = glm::quat_cast(LP.viewT1); + glm::quat rot = (glm::all(glm::equal(r0, r1))) ? r0 : glm::slerp(r0, r1, time); + vec3 tmp = normalize(glm::mat3_cast(rot) * make_vec3(primaryNormal)); + tmp = normalize(vec3(LP.proj * vec4(tmp, 0.f))); + newNormal = vec4(tmp, 1.f); + } + vec4 accumNormal = (newNormal + float(LP.frameID) * oldNormal) / float(LP.frameID + 1); // save data to frame buffers accumPtr[fbOfs] = accum_color; fbPtr[fbOfs] = accum_color; - // albedoPtr[fbOfs] = make_float4(accumAlbedo); - // normalPtr[fbOfs] = make_float4(accumNormal); + albedoPtr[fbOfs] = make_float4(accumAlbedo); + normalPtr[fbOfs] = make_float4(accumNormal); } diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index c3d0923b..f7a5478d 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -23,7 +23,6 @@ #define PBRLUT_IMPLEMENTATION #include #include -#include #include #include @@ -90,7 +89,8 @@ static struct OptixData { LaunchParams LP; GLuint imageTexID = -1; cudaGraphicsResource_t cudaResourceTex; - OWLBuffer sampleIndexBuffer; + bool resourceSharingSuccessful = true; + OWLBuffer assignmentBuffer; OWLBuffer 
frameBuffer; OWLBuffer normalBuffer; @@ -99,6 +99,10 @@ static struct OptixData { OWLBuffer mvecBuffer; OWLBuffer accumBuffer; + OWLBuffer combinedFrameBuffer; + OWLBuffer combinedNormalBuffer; + OWLBuffer combinedAlbedoBuffer; + OWLBuffer entityBuffer; OWLBuffer transformBuffer; OWLBuffer cameraBuffer; @@ -187,7 +191,10 @@ static struct NVISII { bool headlessMode; std::function callback; std::recursive_mutex callbackMutex; - StaticWorkDistribution wd; + + std::vector> events; + std::vector times; + std::vector weights; } NVISII; void applyStyle() @@ -347,14 +354,14 @@ owl4x3f glmToOWL(glm::mat4 &xfm){ return oxfm; } -void synchronizeDevices() +void synchronizeDevices(std::string error_string = "") { for (int i = 0; i < getDeviceCount(); i++) { cudaSetDevice(i); cudaDeviceSynchronize(); cudaError_t err = cudaPeekAtLastError(); if (err != 0) { - std::cout<< "ERROR: " << cudaGetErrorString(err)< signals(num_gpus); + float total_time = 0.f; + for (uint32_t i = 0; i < num_gpus; ++i) total_time += NVISII.times[i]; + for (uint32_t i = 0; i < num_gpus; ++i) signals[i] = NVISII.times[i] / float(total_time); + + std::vector p_error(num_gpus); + for (uint32_t i = 0; i < num_gpus; ++i) p_error[i] = target - signals[i]; + + // update weights + float pK = 1.f; + for (uint32_t i = 0; i < num_gpus; ++i) { + NVISII.weights[i] = max(NVISII.weights[i] + p_error[i], .001f); + } + + std::vector scan; + for (size_t i = 0; i <= num_gpus; ++i) { + if (i == 0) scan.push_back(0.f); + else scan.push_back(scan[i - 1] + NVISII.weights[i - 1]); + } + + // std::cout<<"Scan: "; + for (size_t i = 0; i <= num_gpus; ++i) { + scan[i] /= scan[num_gpus]; + // std::cout< fb_h(width * height); -// std::vector fba_h(width * height); -// std::vector fbn_h(width * height); -// std::vector> fb_hd(deviceCount); -// std::vector> fba_hd(deviceCount); -// std::vector> fbn_hd(deviceCount); -// for (uint32_t i = 0; i < deviceCount; ++i){ -// fb_hd[i] = std::vector(width * height); -// fba_hd[i] = 
std::vector(width * height); -// fbn_hd[i] = std::vector(width * height); -// void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,i); -// void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,i); -// void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,i); -// cudaMemcpyAsync((void*)fb_hd[i].data(), (void*)fb_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); -// cudaMemcpyAsync((void*)fba_hd[i].data(), (void*)fba_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); -// cudaMemcpyAsync((void*)fbn_hd[i].data(), (void*)fbn_d, fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); -// } -// // synchronizeDevices(); - -// // note, GPUs render 32xN strips -// for (uint32_t y = 0; y < height; ++y) { -// for (uint32_t x = 0; x < width; x += 32) { -// if (x >= width) continue; -// int deviceThatIsResponsible = (x>>5) % deviceCount; -// { -// glm::vec4* A = fb_h.data() + (y * width) + x; -// glm::vec4* B = fb_hd[deviceThatIsResponsible].data() + (y * width) + x; -// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); -// } -// { -// glm::vec4* A = fba_h.data() + (y * width) + x; -// glm::vec4* B = fba_hd[deviceThatIsResponsible].data() + (y * width) + x; -// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); -// } -// { -// glm::vec4* A = fbn_h.data() + (y * width) + x; -// glm::vec4* B = fbn_hd[deviceThatIsResponsible].data() + (y * width) + x; -// memcpy(A, B, min(32, int(width - x)) * sizeof(glm::vec4)); -// } -// } -// } +void mergeFrameBuffers() { + // For multigpu setups, we currently render to zero-copy memory to merge on the host. 
+ // So for now, just upload those results to device 0's combined unified frame buffers on the device + owlBufferUpload(OptixData.combinedFrameBuffer, owlBufferGetPointer(OptixData.frameBuffer, 0)); + + if (OptixData.enableAlbedoGuide) { + owlBufferUpload(OptixData.combinedAlbedoBuffer, owlBufferGetPointer(OptixData.albedoBuffer, 0)); + } -// // // note, GPUs render 32xN strips -// // for (uint32_t y = 0; y < height; ++y) { -// // for (uint32_t x = 0; x < width; x += 32) { -// // int deviceThatIsResponsible = (x>>5) % deviceCount; -// // glm::vec4* A = fb_h.data() + (y * width) + x; -// // glm::vec4* B = ((glm::vec4*)fb_d[deviceThatIsResponsible]) + (y * width) + x; -// // cudaMemcpyAsync((void*)A, (void*)B, min(32, int(width - x)) * sizeof(glm::vec4), cudaMemcpyDeviceToHost); -// // } -// // } - - -// // cudaMemcpyAsync(fb_h.data(), fb_d[1], fb_h.size() * sizeof(glm::vec4), cudaMemcpyDeviceToHost); -// synchronizeDevices(); -// void* fb_d = (void*)owlBufferGetPointer(OptixData.frameBuffer,0); -// void* fba_d = (void*)owlBufferGetPointer(OptixData.albedoBuffer,0); -// void* fbn_d = (void*)owlBufferGetPointer(OptixData.normalBuffer,0); -// cudaMemcpyAsync(fb_d, fb_h.data(), fb_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); -// cudaMemcpyAsync(fba_d, fba_h.data(), fba_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); -// cudaMemcpyAsync(fbn_d, fbn_h.data(), fbn_h.size() * sizeof(glm::vec4), cudaMemcpyHostToDevice); -// // synchronizeDevices(); -// } + if (OptixData.enableNormalGuide) { + owlBufferUpload(OptixData.combinedNormalBuffer, owlBufferGetPointer(OptixData.normalBuffer, 0)); + } +} void denoiseImage() { synchronizeDevices(); @@ -1671,8 +1682,6 @@ void denoiseImage() { auto &OD = OptixData; auto cudaStream = owlContextGetStream(OD.context, 0); - CUdeviceptr frameBuffer = (CUdeviceptr) owlBufferGetPointer(OD.frameBuffer, 0); - std::vector inputLayers; OptixImage2D colorLayer; colorLayer.width = OD.LP.frameSize.x; @@ -1680,7 +1689,7 @@ void 
denoiseImage() { colorLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; colorLayer.pixelStrideInBytes = 4 * sizeof(float); colorLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - colorLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.frameBuffer, 0); + colorLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.combinedFrameBuffer, 0); inputLayers.push_back(colorLayer); OptixImage2D albedoLayer; @@ -1689,7 +1698,7 @@ void denoiseImage() { albedoLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; albedoLayer.pixelStrideInBytes = 4 * sizeof(float); albedoLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - albedoLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.albedoBuffer, 0); + albedoLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.combinedAlbedoBuffer, 0); if (OD.enableAlbedoGuide) inputLayers.push_back(albedoLayer); OptixImage2D normalLayer; @@ -1698,7 +1707,7 @@ void denoiseImage() { normalLayer.format = OPTIX_PIXEL_FORMAT_FLOAT4; normalLayer.pixelStrideInBytes = 4 * sizeof(float); normalLayer.rowStrideInBytes = OD.LP.frameSize.x * 4 * sizeof(float); - normalLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.normalBuffer, 0); + normalLayer.data = (CUdeviceptr) owlBufferGetPointer(OD.combinedNormalBuffer, 0); if (OD.enableNormalGuide) inputLayers.push_back(normalLayer); OptixImage2D outputLayer = colorLayer; // can I get away with this? 
@@ -1755,23 +1764,88 @@ void denoiseImage() { (CUdeviceptr) owlBufferGetPointer(OD.denoiserScratchBuffer, 0), scratchSizeInBytes )); +} - synchronizeDevices(); +inline const char* getGLErrorString( GLenum error ) +{ + switch( error ) + { + case GL_NO_ERROR: return "No error"; + case GL_INVALID_ENUM: return "Invalid enum"; + case GL_INVALID_VALUE: return "Invalid value"; + case GL_INVALID_OPERATION: return "Invalid operation"; + //case GL_STACK_OVERFLOW: return "Stack overflow"; + //case GL_STACK_UNDERFLOW: return "Stack underflow"; + case GL_OUT_OF_MEMORY: return "Out of memory"; + //case GL_TABLE_TOO_LARGE: return "Table too large"; + default: return "Unknown GL error"; + } } +#define DO_GL_CHECK +#ifdef DO_GL_CHECK +# define GL_CHECK( call ) \ + do \ + { \ + call; \ + GLenum err = glGetError(); \ + if( err != GL_NO_ERROR ) \ + { \ + std::stringstream ss; \ + ss << "GL error " << getGLErrorString( err ) << " at " \ + << __FILE__ << "(" << __LINE__ << "): " << #call \ + << std::endl; \ + std::cerr << ss.str() << std::endl; \ + throw std::runtime_error( ss.str().c_str() ); \ + } \ + } \ + while (0) + + +# define GL_CHECK_ERRORS( ) \ + do \ + { \ + GLenum err = glGetError(); \ + if( err != GL_NO_ERROR ) \ + { \ + std::stringstream ss; \ + ss << "GL error " << getGLErrorString( err ) << " at " \ + << __FILE__ << "(" << __LINE__ << ")"; \ + std::cerr << ss.str() << std::endl; \ + throw std::runtime_error( ss.str().c_str() ); \ + } \ + } \ + while (0) + +#else +# define GL_CHECK( call ) do { call; } while(0) +# define GL_CHECK_ERRORS( ) do { ; } while(0) +#endif + void drawFrameBufferToWindow() { synchronizeDevices(); glFlush(); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); auto &OD = OptixData; - cudaGraphicsMapResources(1, &OD.cudaResourceTex); - const void* fbdevptr = owlBufferGetPointer(OD.frameBuffer,0); - cudaArray_t array; - cudaGraphicsSubResourceGetMappedArray(&array, OD.cudaResourceTex, 0, 0); - cudaMemcpyToArray(array, 0, 0, fbdevptr, OD.LP.frameSize.x * 
OD.LP.frameSize.y * sizeof(glm::vec4), cudaMemcpyDeviceToDevice); - cudaGraphicsUnmapResources(1, &OD.cudaResourceTex); - + const void* fbdevptr = owlBufferGetPointer(OD.combinedFrameBuffer,0); + + if (OD.resourceSharingSuccessful) { + cudaGraphicsMapResources(1, &OD.cudaResourceTex); + cudaArray_t array; + cudaGraphicsSubResourceGetMappedArray(&array, OD.cudaResourceTex, 0, 0); + cudaMemcpyToArray(array, 0, 0, fbdevptr, OD.LP.frameSize.x * OD.LP.frameSize.y * sizeof(glm::vec4), cudaMemcpyDeviceToDevice); + cudaGraphicsUnmapResources(1, &OD.cudaResourceTex); + } else { + GL_CHECK(glBindTexture(GL_TEXTURE_2D, OD.imageTexID)); + glEnable(GL_TEXTURE_2D); + GL_CHECK(glTexSubImage2D(GL_TEXTURE_2D,0, + 0, 0, + OD.LP.frameSize.x, OD.LP.frameSize.y, + GL_RGBA, GL_FLOAT, fbdevptr)); + } + // Draw pixels from optix frame buffer glEnable(GL_FRAMEBUFFER_SRGB); glViewport(0, 0, OD.LP.frameSize.x, OD.LP.frameSize.y); @@ -1784,7 +1858,6 @@ void drawFrameBufferToWindow() glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); glDisable(GL_DEPTH_TEST); - glBindTexture(GL_TEXTURE_2D, OD.imageTexID); // Draw texture to screen via immediate mode glEnable(GL_TEXTURE_2D); @@ -1980,6 +2053,7 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi resizeOptixFrameBuffer(width, height); resetAccumulation(); updateComponents(); + int numGPUs = owlGetDeviceCount(OptixData.context); for (uint32_t i = 0; i < samplesPerPixel; ++i) { // std::cout< render(uint32_t width, uint32_t height, uint32_t samplesPerPi } updateLaunchParams(); - for (uint32_t deviceID = 0; deviceID < owlGetDeviceCount(OptixData.context); deviceID++) { - owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { + cudaSetDevice(deviceID); + cudaEventRecord(NVISII.events[deviceID].first); + owlAsyncLaunch2DOnDevice(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, deviceID, 
OptixData.launchParams); + cudaEventRecord(NVISII.events[deviceID].second); + } + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { + cudaEventSynchronize(NVISII.events[deviceID].second); + cudaEventElapsedTime(&NVISII.times[deviceID], NVISII.events[deviceID].first, NVISII.events[deviceID].second); } - owlLaunchSync(OptixData.launchParams); + updateGPUWeights(); + mergeFrameBuffers(); if (!NVISII.headlessMode) { if (OptixData.enableDenoiser) @@ -2022,17 +2104,14 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi std::cout<<"\r "<< samplesPerPixel << "/" << samplesPerPixel <<" - done!" << std::endl; } - synchronizeDevices(); - if (OptixData.enableDenoiser) { denoiseImage(); } + synchronizeDevices(); const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); - - synchronizeDevices(); }); return frameBuffer; @@ -2125,6 +2204,9 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra else if (option == std::string("heatmap")) { OptixData.LP.renderDataMode = RenderDataFlags::HEATMAP; } + else if (option == std::string("device_id")) { + OptixData.LP.renderDataMode = RenderDataFlags::DEVICE_ID; + } else { throw std::runtime_error(std::string("Error, unknown option : \"") + _option + std::string("\". 
") + std::string("See documentation for available options")); @@ -2135,6 +2217,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra OptixData.LP.renderDataBounce = bounce; OptixData.LP.seed = seed; updateComponents(); + int numGPUs = owlGetDeviceCount(OptixData.context); for (uint32_t i = startFrame; i < frameCount; ++i) { // std::cout< renderData(uint32_t width, uint32_t height, uint32_t startFra updateLaunchParams(); - for (uint32_t deviceID = 0; deviceID < owlGetDeviceCount(OptixData.context); deviceID++) { - owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { + cudaSetDevice(deviceID); + cudaEventRecord(NVISII.events[deviceID].first); + owlAsyncLaunch2DOnDevice(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, deviceID, OptixData.launchParams); + cudaEventRecord(NVISII.events[deviceID].second); + } + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { + cudaEventSynchronize(NVISII.events[deviceID].second); + cudaEventElapsedTime(&NVISII.times[deviceID], NVISII.events[deviceID].first, NVISII.events[deviceID].second); } - owlLaunchSync(OptixData.launchParams); + updateGPUWeights(); + mergeFrameBuffers(); // Dont run denoiser to raw data rendering // if (OptixData.enableDenoiser) @@ -2427,9 +2518,10 @@ void initializeInteractive( glfw->poll_events(); initializeOptix(/*headless = */ false); - initializeImgui(); + int numGPUs = owlGetDeviceCount(OptixData.context); + while (!stopped) { /* Poll events from the window */ @@ -2451,15 +2543,23 @@ void initializeInteractive( updateFrameBuffer(); updateComponents(); updateLaunchParams(); - for (uint32_t deviceID = 0; deviceID < owlGetDeviceCount(OptixData.context); deviceID++) { - owlAsyncLaunch2DOnDevice(OptixData.rayGen, NVISII.wd.numSamples(deviceID), 1, deviceID, OptixData.launchParams); + + for (uint32_t deviceID = 0; deviceID < 
numGPUs; deviceID++) { + cudaSetDevice(deviceID); + cudaEventRecord(NVISII.events[deviceID].first, owlParamsGetCudaStream(OptixData.launchParams, deviceID)); + owlAsyncLaunch2DOnDevice(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, deviceID, OptixData.launchParams); + cudaEventRecord(NVISII.events[deviceID].second, owlParamsGetCudaStream(OptixData.launchParams, deviceID)); } owlLaunchSync(OptixData.launchParams); - - if (OptixData.enableDenoiser) - { + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { + cudaEventElapsedTime(&NVISII.times[deviceID], NVISII.events[deviceID].first, NVISII.events[deviceID].second); + } + updateGPUWeights(); + mergeFrameBuffers(); + + if (OptixData.enableDenoiser) { denoiseImage(); - } + } } // glm::vec4* samplePtr = (glm::vec4*) owlBufferGetPointer(OptixData.accumBuffer,0); // glm::vec4* mvecPtr = (glm::vec4*) owlBufferGetPointer(OptixData.mvecBuffer,0); @@ -2485,7 +2585,10 @@ void initializeInteractive( OPTIX_CHECK(optixDenoiserDestroy(OptixData.denoiser)); if (OptixData.imageTexID != -1) { - cudaGraphicsUnregisterResource(OptixData.cudaResourceTex); + if (OptixData.cudaResourceTex) { + cudaGraphicsUnregisterResource(OptixData.cudaResourceTex); + OptixData.cudaResourceTex = 0; + } glDeleteTextures(1, &OptixData.imageTexID); } @@ -2647,12 +2750,12 @@ void updateSceneAabb(Entity* entity) void enableUpdates() { - enqueueCommand([] () { lazyUpdatesEnabled = false; }); + enqueueCommandAndWait([] () { lazyUpdatesEnabled = false; }); } void disableUpdates() { - enqueueCommand([] () { lazyUpdatesEnabled = true; }); + enqueueCommandAndWait([] () { lazyUpdatesEnabled = true; }); } bool areUpdatesEnabled() @@ -2819,6 +2922,9 @@ void __test__(std::vector args) { else if (option == std::string("heatmap")) { OptixData.LP.renderDataMode = RenderDataFlags::HEATMAP; } + else if (option == std::string("device_id")) { + OptixData.LP.renderDataMode = RenderDataFlags::DEVICE_ID; + } else { throw 
std::runtime_error(std::string("Error, unknown option : \"") + option + std::string("\". ") + std::string("See documentation for available options")); diff --git a/src/nvisii/nvisii.cu b/src/nvisii/nvisii.cu index 0d81d5d2..15f41939 100644 --- a/src/nvisii/nvisii.cu +++ b/src/nvisii/nvisii.cu @@ -54,38 +54,3 @@ void reproject(glm::vec4 *sampleBuffer, glm::vec4 *t0AlbedoBuffer, glm::vec4 *t1 _reproject<<>>(sampleBuffer, t0AlbedoBuffer, t1AlbedoBuffer, mvecBuffer, scratchBuffer, imageBuffer, true, width, height); _reproject<<>>(sampleBuffer, t0AlbedoBuffer, t1AlbedoBuffer, mvecBuffer, scratchBuffer, imageBuffer, false, width, height); } - -#include "work_distribution.h" - -extern "C" __global__ void fillSamples( - int gpu_idx, - int num_gpus, - int width, - int height, - int2* sample_indices ) -{ - StaticWorkDistribution wd; - wd.setRasterSize( width, height ); - wd.setNumGPUs( num_gpus ); - - const int sample_idx = blockIdx.x; - sample_indices[sample_idx] = wd.getSamplePixel( gpu_idx, sample_idx ); -} - - -extern "C" __host__ void fillSamplesCUDA( - int num_samples, - cudaStream_t stream, - int gpu_idx, - int num_gpus, - int width, - int height, - int2* sample_indices ) -{ - fillSamples<<>>( - gpu_idx, - num_gpus, - width, - height, - sample_indices ); -} From 8a386fc091d744c75a2786e6e439f443eeb9ae17 Mon Sep 17 00:00:00 2001 From: n8vm Date: Fri, 2 Apr 2021 18:44:05 -0600 Subject: [PATCH 07/55] adding some sync points --- src/nvisii/nvisii.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 3f415090..7cd2ad85 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1948,6 +1948,9 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi if ((width < 1) || (height < 1)) throw std::runtime_error("Error, invalid width/height"); std::vector frameBuffer(width * height * 4); + // flush command queue + enqueueCommandAndWait([](){}); + enqueueCommandAndWait([&frameBuffer, width, height, 
samplesPerPixel, seed] () { if (!NVISII.headlessMode) { if ((width != WindowData.currentSize.x) || (height != WindowData.currentSize.y)) @@ -2033,6 +2036,8 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra { std::vector frameBuffer(width * height * 4); + enqueueCommandAndWait([](){}); + enqueueCommandAndWait([&frameBuffer, width, height, startFrame, frameCount, bounce, _option, seed] () { if (!NVISII.headlessMode) { if ((width != WindowData.currentSize.x) || (height != WindowData.currentSize.y)) From bc5d1af6c73ba2ed5237c21b12611d43ca969bf2 Mon Sep 17 00:00:00 2001 From: n8vm Date: Fri, 2 Apr 2021 22:00:37 -0600 Subject: [PATCH 08/55] Adding support for constant material parameters with import_scene function --- include/nvisii/nvisii.h | 6 +-- src/nvisii/light.cpp | 6 +-- src/nvisii/nvisii_import_scene.cpp | 65 ++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index d15f6220..e14a2a97 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -378,11 +378,11 @@ struct Scene { * * @param filepath The path for the file to load * @param position A change in position to apply to all entities generated by this function - * @param position A change in scale to apply to all entities generated by this function - * @param position A change in rotation to apply to all entities generated by this function + * @param scale A change in scale to apply to all entities generated by this function + * @param rotation A change in rotation to apply to all entities generated by this function * @param args A list of optional arguments that can effect the importer. * Possible options include: - * "verbose" - print out information related to loading the scene. + * "verbose" - print out information related to loading the scene. Useful for debugging! 
*/ Scene importScene( std::string file_path, diff --git a/src/nvisii/light.cpp b/src/nvisii/light.cpp index 01e3ab02..842981b8 100644 --- a/src/nvisii/light.cpp +++ b/src/nvisii/light.cpp @@ -46,9 +46,9 @@ LightStruct &Light::getStruct() { void Light::setColor(glm::vec3 color) { auto &light = getStruct(); - light.r = max(0.f, min(color.r, 1.f)); - light.g = max(0.f, min(color.g, 1.f)); - light.b = max(0.f, min(color.b, 1.f)); + light.r = max(0.f, color.r); + light.g = max(0.f, color.g); + light.b = max(0.f, color.b); markDirty(); } diff --git a/src/nvisii/nvisii_import_scene.cpp b/src/nvisii/nvisii_import_scene.cpp index 9b557d69..60bbd3a5 100644 --- a/src/nvisii/nvisii_import_scene.cpp +++ b/src/nvisii/nvisii_import_scene.cpp @@ -101,7 +101,7 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu nvisiiScene.materials.push_back(mat); material_light_map[mat] = nullptr; aiString Path; - + // Diffuse/specular workflow if (material->GetTextureCount(aiTextureType_DIFFUSE) > 0) { if (material->GetTexture(aiTextureType_DIFFUSE, 0, &Path, NULL, NULL, NULL, NULL, NULL) == AI_SUCCESS) { @@ -204,6 +204,62 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu auto name = std::string(material->GetName().C_Str()); auto mat = nvisiiScene.materials[materialIdx]; aiString Path; + + if (verbose) std::cout<<"Creating material : " << name << std::endl; + + aiColor3D color (0.f,0.f,0.f); + if(AI_SUCCESS == material->Get(AI_MATKEY_COLOR_DIFFUSE, color)) { + mat->setBaseColor(glm::vec3(color.r, color.g, color.b)); + if (verbose) std::cout<<"Assigning base color : " << color.r << " " << color.g << " " << color.b << std::endl; + } + if(AI_SUCCESS == material->Get(AI_MATKEY_COLOR_EMISSIVE, color)) { + if (color.r > 0.f || color.g > 0.f || color.b > 0.f) { + if (verbose) std::cout<<"Assigning base color : " << color.r << " " << color.g << " " << color.b << std::endl; + if (Light::get(mat->getName()) == nullptr) { + 
Light::create(mat->getName()); + nvisiiScene.lights.push_back(material_light_map[mat]); + } + material_light_map[mat] = Light::get(mat->getName()); + material_light_map[mat]->setColor(glm::vec3(color.r, color.g, color.b)); + } + } + if(AI_SUCCESS == material->Get(AI_MATKEY_COLOR_SPECULAR, color)) { + if (color.r == color.b && color.r == color.g) { + if (verbose) std::cout<<"Setting constant specular: " << color.r << std::endl; + mat->setSpecular(color.r); + } + else if (verbose) { + std::cout<<"Error, colored specular found (not supported)" << std::endl; + } + } + + float scalar; + if(AI_SUCCESS == material->Get(AI_MATKEY_SHININESS, scalar)) { + if (scalar != 0.f) { + if (verbose) std::cout<<"Interpreting shininess as 2/roughness^4 - 2: " << powf(2.f / (scalar + 2.f), 1.f/4.f) << std::endl; + mat->setRoughness(powf(2.f / (scalar + 2.f), 1.f/4.f)); + } + } + float ior = 1.f; + if(AI_SUCCESS == material->Get(AI_MATKEY_REFRACTI, ior)) { + if (verbose) std::cout<<"Assigning index of refraction " << ior << std::endl; + mat->setIor(ior); + } + + if(AI_SUCCESS == material->Get(AI_MATKEY_OPACITY, scalar)) { + if (scalar != 1.f) { + if (ior == 1) { + if (verbose) std::cout<<"Assigning opacity " << scalar << std::endl; + mat->setAlpha(scalar); + } + else { + if (verbose) std::cout<<"IOR != 1.0, interpreting dissolve as transmission " << scalar << std::endl; + mat->setTransmission(scalar); + } + } + } + + // todo, add texture paths to map above, load later and connect if (material->GetTextureCount(aiTextureType_DIFFUSE) > 0) { @@ -236,9 +292,12 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu std::string path = directory + "/" + std::string(Path.C_Str()); std::replace(path.begin(), path.end(), '\\', '/'); if (texture_map[path]) { - material_light_map[mat] = Light::create(mat->getName()); + if (Light::get(mat->getName()) == nullptr) { + Light::create(mat->getName()); + nvisiiScene.lights.push_back(material_light_map[mat]); + } + 
material_light_map[mat] = Light::get(mat->getName()); material_light_map[mat]->setColorTexture(texture_map[path]); - nvisiiScene.lights.push_back(material_light_map[mat]); } } } From 1991793a890fe5374913bc17e6b035ce226233f0 Mon Sep 17 00:00:00 2001 From: n8vm Date: Fri, 2 Apr 2021 23:59:19 -0600 Subject: [PATCH 09/55] updating owl submodule --- externals/owl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/externals/owl b/externals/owl index 97d3acd6..f1d01daa 160000 --- a/externals/owl +++ b/externals/owl @@ -1 +1 @@ -Subproject commit 97d3acd6ad4d0e356f90959d34c8a94024a4fa70 +Subproject commit f1d01daa451151e6ac64c7f9d6662b354b467384 From 74ccee12d2fb92e788017c9d49f082db6694d8b7 Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 3 Apr 2021 00:45:59 -0600 Subject: [PATCH 10/55] adding more visibility flags. --- include/nvisii/entity.h | 16 ++++++++++++-- include/nvisii/entity_struct.h | 5 +++++ src/nvisii/entity.cpp | 39 +++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/include/nvisii/entity.h b/include/nvisii/entity.h index 5120184a..9fadab2a 100644 --- a/include/nvisii/entity.h +++ b/include/nvisii/entity.h @@ -211,9 +211,21 @@ class Entity : public StaticFactory { /** * Objects can be set to be invisible to particular ray types: - * @param camera Makes the object visible to camera rays + * @param camera Makes the object visible to camera rays (the first rays to be traced from the camera). + * @param diffuse (todo...) Makes the object visible to diffuse rays (eg for diffuse GI) + * @param glossy (todo...) Makes the object visible to glossy rays (eg in reflections) + * @param transmission (todo...) Makes the object visible to transmission rays (eg from inside glass) + * @param volume_scatter (todo...) Makes the object visible to volume scatter rays (eg from light simulation inside a volume) + * @param shadow Enables the object to cast shadows. 
*/ - void setVisibility(bool camera = true); + void setVisibility( + bool camera = true, + bool diffuse = true, + bool glossy = true, + bool transmission = true, + bool volume_scatter = true, + bool shadow = true + ); /** @returns the minimum axis aligned bounding box position. Requires a transform and mesh component to be attached. */ glm::vec3 getMinAabbCorner(); diff --git a/include/nvisii/entity_struct.h b/include/nvisii/entity_struct.h index 41658fa4..9e77c832 100644 --- a/include/nvisii/entity_struct.h +++ b/include/nvisii/entity_struct.h @@ -7,6 +7,11 @@ #ifndef ENTITY_VISIBILITY_FLAGS #define ENTITY_VISIBILITY_FLAGS #define ENTITY_VISIBILITY_CAMERA_RAYS (1<<0) +#define ENTITY_VISIBILITY_DIFFUSE_RAYS (1<<1) +#define ENTITY_VISIBILITY_GLOSSY_RAYS (1<<2) +#define ENTITY_VISIBILITY_TRANSMISSION_RAYS (1<<3) +#define ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS (1<<4) +#define ENTITY_VISIBILITY_SHADOW_RAYS (1<<5) #endif struct EntityStruct { diff --git a/src/nvisii/entity.cpp b/src/nvisii/entity.cpp index 52987292..aac5d55c 100644 --- a/src/nvisii/entity.cpp +++ b/src/nvisii/entity.cpp @@ -253,7 +253,14 @@ Mesh* Entity::getMesh() return &mesh; } -void Entity::setVisibility(bool camera) +void Entity::setVisibility( + bool camera, + bool diffuse, + bool glossy, + bool transmission, + bool volume_scatter, + bool shadow +) { std::lock_guard lock(*Entity::getEditMutex().get()); @@ -263,6 +270,36 @@ void Entity::setVisibility(bool camera) } else { entity.flags &= (~ENTITY_VISIBILITY_CAMERA_RAYS); } + + if (diffuse) { + entity.flags |= ENTITY_VISIBILITY_DIFFUSE_RAYS; + } else { + entity.flags &= (~ENTITY_VISIBILITY_DIFFUSE_RAYS); + } + + if (glossy) { + entity.flags |= ENTITY_VISIBILITY_GLOSSY_RAYS; + } else { + entity.flags &= (~ENTITY_VISIBILITY_GLOSSY_RAYS); + } + + if (transmission) { + entity.flags |= ENTITY_VISIBILITY_TRANSMISSION_RAYS; + } else { + entity.flags &= (~ENTITY_VISIBILITY_TRANSMISSION_RAYS); + } + + if (volume_scatter) { + entity.flags |= 
ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS; + } else { + entity.flags &= (~ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS); + } + + if (shadow) { + entity.flags |= ENTITY_VISIBILITY_SHADOW_RAYS; + } else { + entity.flags &= (~ENTITY_VISIBILITY_SHADOW_RAYS); + } markDirty(); } From 5a68fdf48f0daaa87aba11b7d719cc7c8cfdf8e5 Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 3 Apr 2021 00:54:38 -0600 Subject: [PATCH 11/55] assigning visibility masks to entities --- src/nvisii/nvisii.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index fa576ccf..2e0bfb1f 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1290,12 +1290,11 @@ void updateComponents() OD.volumeHandles[v->getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint8_t), gridHdlPtr.get()->size(), nullptr); owlBufferUpload(OD.volumeHandles[v->getAddress()], gridHdlPtr.get()->data()); - printf("%hhx\n",gridHdlPtr.get()->data()[0]); + // printf("%hhx\n",gridHdlPtr.get()->data()[0]); const void* d_gridData = owlBufferGetPointer(OD.volumeHandles[v->getAddress()], 0); uint8_t first_byte; cudaMemcpy((void*)&first_byte, d_gridData, 1, cudaMemcpyDeviceToHost); - printf("%hhx\n",first_byte); - + // printf("%hhx\n",first_byte); // Create geometry and build BLAS uint32_t volumeID = v->getAddress(); @@ -1321,12 +1320,14 @@ void updateComponents() std::vector surfaceInstances; std::vector t0SurfaceTransforms; std::vector t1SurfaceTransforms; + std::vector surfaceMasks; std::vector surfaceInstanceToEntity; // Volume instances std::vector volumeInstances; std::vector t0VolumeTransforms; std::vector t1VolumeTransforms; + std::vector volumeMasks; std::vector volumeInstanceToEntity; // Todo: curves... 
@@ -1362,6 +1363,7 @@ void updateComponents() surfaceInstanceToEntity.push_back(eid); t0SurfaceTransforms.push_back(prevLocalToWorld); t1SurfaceTransforms.push_back(localToWorld); + surfaceMasks.push_back(entities[eid].getStruct().flags); } // Add any instanced volume geometry to the list @@ -1376,11 +1378,14 @@ void updateComponents() volumeInstanceToEntity.push_back(eid); t0VolumeTransforms.push_back(prevLocalToWorld); t1VolumeTransforms.push_back(localToWorld); + volumeMasks.push_back(entities[eid].getStruct().flags); } } + std::vector owlSurfaceVisibilityMasks; std::vector t0OwlSurfaceTransforms; std::vector t1OwlSurfaceTransforms; + std::vector owlVolumeVisibilityMasks; std::vector t0OwlVolumeTransforms; std::vector t1OwlVolumeTransforms; auto oldSurfaceIAS = OD.surfacesIAS; @@ -1409,9 +1414,11 @@ void updateComponents() instanceGroupSetChild(OD.surfacesIAS, iid, surfaceInstances[iid]); t0OwlSurfaceTransforms.push_back(glmToOWL(t0SurfaceTransforms[iid])); t1OwlSurfaceTransforms.push_back(glmToOWL(t1SurfaceTransforms[iid])); + owlSurfaceVisibilityMasks.push_back(surfaceMasks[iid]); } owlInstanceGroupSetTransforms(OD.surfacesIAS,0,(const float*)t0OwlSurfaceTransforms.data()); owlInstanceGroupSetTransforms(OD.surfacesIAS,1,(const float*)t1OwlSurfaceTransforms.data()); + owlInstanceGroupSetVisibilityMasks(OD.surfacesIAS, owlSurfaceVisibilityMasks.data()); owlBufferResize(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.size()); owlBufferUpload(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.data()); } @@ -1423,9 +1430,11 @@ void updateComponents() instanceGroupSetChild(OD.volumesIAS, iid, volumeInstances[iid]); t0OwlVolumeTransforms.push_back(glmToOWL(t0VolumeTransforms[iid])); t1OwlVolumeTransforms.push_back(glmToOWL(t1VolumeTransforms[iid])); + owlVolumeVisibilityMasks.push_back(volumeMasks[iid]); } owlInstanceGroupSetTransforms(OD.volumesIAS,0,(const float*)t0OwlVolumeTransforms.data()); owlInstanceGroupSetTransforms(OD.volumesIAS,1,(const 
float*)t1OwlVolumeTransforms.data()); + owlInstanceGroupSetVisibilityMasks(OD.volumesIAS, owlVolumeVisibilityMasks.data()); owlBufferResize(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.size()); owlBufferUpload(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.data()); } From 8db142e5b8b39741b2240c8853073098f23ebf53 Mon Sep 17 00:00:00 2001 From: n8vm Date: Tue, 6 Apr 2021 13:02:02 -0600 Subject: [PATCH 12/55] forcing denoiser configure to wait --- src/nvisii/nvisii.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 2e0bfb1f..012c8dd8 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1946,7 +1946,7 @@ void configureDenoiser(bool useAlbedoGuide, bool useNormalGuide, bool useKernelP "If normal guide is enabled, albedo guide must also be enabled."); } - enqueueCommand([useAlbedoGuide, useNormalGuide, useKernelPrediction](){ + enqueueCommandAndWait([useAlbedoGuide, useNormalGuide, useKernelPrediction](){ OptixData.enableAlbedoGuide = useAlbedoGuide; OptixData.enableNormalGuide = useNormalGuide; #ifdef USE_OPTIX70 From 96c0aeee3d3314bb79f51b002d8d3ebe2b722f69 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 10:37:53 -0600 Subject: [PATCH 13/55] fixing bug where renderToImage wasnt saving the correct buffer --- src/nvisii/nvisii.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 012c8dd8..8a7da290 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -2029,7 +2029,7 @@ std::vector readFrameBuffer() { int num_devices = getDeviceCount(); synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*)owlBufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*)owlBufferGetPointer(OptixData.combinedFrameBuffer,0); for (uint32_t test = 0; test < frameBuffer.size(); test += 4) { frameBuffer[test + 0] = fb[test / 4].r; frameBuffer[test + 1] = 
fb[test / 4].g; @@ -2046,9 +2046,6 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi if ((width < 1) || (height < 1)) throw std::runtime_error("Error, invalid width/height"); std::vector frameBuffer(width * height * 4); - // flush command queue - enqueueCommandAndWait([](){}); - enqueueCommandAndWait([&frameBuffer, width, height, samplesPerPixel, seed] () { if (!NVISII.headlessMode) { if ((width != WindowData.currentSize.x) || (height != WindowData.currentSize.y)) @@ -2122,7 +2119,7 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi } synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.combinedFrameBuffer,0); cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); }); @@ -2271,7 +2268,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra synchronizeDevices(); - const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); + const glm::vec4 *fb = (const glm::vec4*) owlBufferGetPointer(OptixData.combinedFrameBuffer,0); cudaMemcpyAsync(frameBuffer.data(), fb, width * height * sizeof(glm::vec4), cudaMemcpyDeviceToHost); OptixData.LP.renderDataMode = 0; From 8ccbf36bbd239771473f12819c404a89be3d06ff Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 10:59:30 -0600 Subject: [PATCH 14/55] working on a volume sample --- examples/download_content.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/download_content.sh b/examples/download_content.sh index 02b0fe46..92f416b3 100644 --- a/examples/download_content.sh +++ b/examples/download_content.sh @@ -23,4 +23,7 @@ wget https://www.dropbox.com/s/22bug1he354oqpt/bmw.zip unzip bmw.zip -d bmw/ rm bmw.zip -wget https://www.dropbox.com/s/76gumyy7j0f3cyj/dragon.stl \ No newline at end of file +wget 
https://www.dropbox.com/s/76gumyy7j0f3cyj/dragon.stl + +wget https://www.dropbox.com/s/runlp60bjjf3dpu/bunny_cloud.zip +unzip bunny_cloud.zip From 5746cb6fb8c1c512988abbb2cedfb68c237d2bfc Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 11:25:55 -0600 Subject: [PATCH 15/55] simplifying download_content script a bit --- examples/content.txt | 11 +++++++++++ examples/download_content.sh | 22 ++++------------------ 2 files changed, 15 insertions(+), 18 deletions(-) create mode 100644 examples/content.txt diff --git a/examples/content.txt b/examples/content.txt new file mode 100644 index 00000000..1b7882e6 --- /dev/null +++ b/examples/content.txt @@ -0,0 +1,11 @@ +https://www.dropbox.com/s/jve877nanizw2vf/dragon.zip +https://www.dropbox.com/s/jh3o6wtqdrq4bi2/photos_2020_5_11_fst_gray-wall-grunge.jpg +https://www.dropbox.com/s/gb67d0cv1lgrgdp/kiara_4_mid-morning_4k.hdr +https://www.dropbox.com/s/8nj82vxvxwvnttt/salle_de_bain_separated.zip +https://www.dropbox.com/s/p2xius4kd4olqg3/gradient.png +https://www.dropbox.com/s/bxbkzmuy2mviyzb/Bricks051_2K-JPG.zip +https://www.dropbox.com/s/na3vo8rca7feoiq/teatro_massimo_2k.hdr +https://www.dropbox.com/s/22bug1he354oqpt/bmw.zip +https://www.dropbox.com/s/76gumyy7j0f3cyj/dragon.stl +https://www.dropbox.com/s/runlp60bjjf3dpu/bunny_cloud.zip +https://www.dropbox.com/s/nim7jsjiumei4f9/boston_teapot_256x256x178_uint8.zip diff --git a/examples/download_content.sh b/examples/download_content.sh index 92f416b3..c6ca856f 100644 --- a/examples/download_content.sh +++ b/examples/download_content.sh @@ -1,29 +1,15 @@ mkdir content cd content -wget https://www.dropbox.com/s/jve877nanizw2vf/dragon.zip -unzip dragon.zip -d dragon/ -rm dragon.zip - -wget https://www.dropbox.com/s/jh3o6wtqdrq4bi2/photos_2020_5_11_fst_gray-wall-grunge.jpg -wget https://www.dropbox.com/s/gb67d0cv1lgrgdp/kiara_4_mid-morning_4k.hdr +wget -i ../content.txt -wget https://www.dropbox.com/s/8nj82vxvxwvnttt/salle_de_bain_separated.zip +unzip dragon.zip -d 
dragon/ +rm dragon.zip unzip salle_de_bain_separated.zip rm salle_de_bain_separated.zip - -wget https://www.dropbox.com/s/p2xius4kd4olqg3/gradient.png -wget https://www.dropbox.com/s/bxbkzmuy2mviyzb/Bricks051_2K-JPG.zip unzip Bricks051_2K-JPG.zip - -wget https://www.dropbox.com/s/na3vo8rca7feoiq/teatro_massimo_2k.hdr - mkdir bmw -wget https://www.dropbox.com/s/22bug1he354oqpt/bmw.zip unzip bmw.zip -d bmw/ rm bmw.zip - -wget https://www.dropbox.com/s/76gumyy7j0f3cyj/dragon.stl - -wget https://www.dropbox.com/s/runlp60bjjf3dpu/bunny_cloud.zip unzip bunny_cloud.zip +unzip boston_teapot_256x256x178_uint8.zip From 04f2bb25c3b016abae1336af1ad1b85b53d94910 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 12:42:12 -0600 Subject: [PATCH 16/55] adding a volumes example. Some bugs need to be worked out --- examples/22.volumes.py | 146 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 examples/22.volumes.py diff --git a/examples/22.volumes.py b/examples/22.volumes.py new file mode 100644 index 00000000..9a74eb14 --- /dev/null +++ b/examples/22.volumes.py @@ -0,0 +1,146 @@ +#%% + +# 22.volumes.py +# +# This shows an example of two volumes. One volume uses the NanoVDB format, +# and the other is a raw volume. + +import nvisii +import numpy as np +opt = lambda: None +opt.spp = 50 +opt.width = 512 +opt.height = 512 +opt.out = '22_volumes.png' + +# headless - no window +# verbose - output number of frames rendered, etc.. +nvisii.initialize(headless = False, verbose = True, window_on_top = True) + +#%% +# Use a neural network to denoise ray traced +nvisii.enable_denoiser() + +# First, lets create an entity that will serve as our camera. +camera = nvisii.entity.create(name = "camera") + +# To place the camera into our scene, we'll add a "transform" component. +# (All nvisii objects have a "name" that can be used for easy lookup later.) 
+camera.set_transform(nvisii.transform.create(name = "camera_transform")) + +# To make our camera entity act like a "camera", we'll add a camera component +camera.set_camera( + nvisii.camera.create_from_fov( + name = "camera_camera", + field_of_view = 0.785398, # note, this is in radians + aspect = opt.width / float(opt.height) + ) +) + +# Finally, we'll select this entity to be the current camera entity. +# (nvisii can only use one camera at the time) +nvisii.set_camera_entity(camera) + +# Lets set the camera to look at an object. +# We'll do this by editing the transform component. +camera.get_transform().look_at(at = (0, 0, .9), up = (0, 0, 1), eye = (0, 5, 1)) + +# Next, lets at an object (a floor). +floor = nvisii.entity.create( + name = "floor", + mesh = nvisii.mesh.create_plane("mesh_floor"), + transform = nvisii.transform.create("transform_floor"), + material = nvisii.material.create("material_floor") +) + +# Lets make our floor act as a mirror +mat = floor.get_material() +# mat = nvisii.material.get("material_floor") # <- this also works +#%% +# Mirrors are smooth and "metallic". 
+mat.set_base_color((1.,1.,1.)) +mat.set_metallic(0) +mat.set_roughness(1) + +# Make the floor large by scaling it +trans = floor.get_transform() +trans.set_scale((5,5,1)) + +#%% +# Let's also add a sphere +torus = nvisii.entity.create( + name="torus", + volume = nvisii.volume.create_torus("torus"), + transform = nvisii.transform.create("torus"), + material = nvisii.material.create("torus") +) +#%% +torus.get_transform().set_position((.5,2,0.35)) +torus.get_transform().set_scale((0.003, 0.003, 0.003)) +torus.get_transform().set_angle_axis(3.14 * .25, (1,0,0)) +torus.get_material().set_base_color((.0,0.0,1)) +torus.get_material().set_roughness(0.0) +torus.get_material().set_transmission(0.0) +torus.get_volume().set_gradient_factor(10) +torus.get_volume().set_absorption(0) +torus.get_volume().set_scattering(1) +torus.get_volume().set_scale(50) + +#%% +# Let's also add a bunny +bunny = nvisii.entity.create( + name="bunny", + volume = nvisii.volume.create_from_file("bunny", "./content/bunny_cloud.nvdb"), + transform = nvisii.transform.create("bunny"), + material = nvisii.material.create("bunny") +) +#%% +bunny.get_transform().set_position((-1,0,0.75)) +bunny.get_transform().set_scale((0.003, 0.003, 0.003)) +bunny.get_material().set_base_color((0.1,0.9,0.08)) +bunny.get_material().set_roughness(0.7) +bunny.get_volume().set_gradient_factor(10) +bunny.get_volume().set_absorption(0) +bunny.get_volume().set_scattering(1) +bunny.get_volume().set_scale(10) +bunny.get_transform().set_angle_axis(nvisii.pi() * .5, (1,0,0)) +bunny.get_transform().add_angle_axis(nvisii.pi(), (0,1,0)) + +#%% +voxels = np.fromfile("./content/boston_teapot_256x256x178_uint8.raw", dtype=np.uint8).astype(np.float32) + + + +#%% +# Let's also add a teapot +teapot = nvisii.entity.create( + name="teapot", + volume = nvisii.volume.create_from_data("teapot", width = 256, height = 256, depth = 178, data = voxels, background = 0.0), + transform = nvisii.transform.create("teapot"), + material = 
nvisii.material.create("teapot") +) +#%% +teapot.get_transform().set_position((1,0,0.7)) +teapot.get_transform().set_scale((0.005, 0.005, 0.005)) +teapot.get_material().set_base_color((1.0,0.0,0.0)) +teapot.get_material().set_roughness(0.0) +teapot.get_material().set_metallic(1.0) +teapot.get_volume().set_gradient_factor(100) +teapot.get_volume().set_absorption(1) +teapot.get_volume().set_scattering(0) +teapot.get_volume().set_scale(250) +teapot.get_transform().set_angle_axis(-nvisii.pi() * .5, (1,0,0)) +teapot.get_transform().add_angle_axis(nvisii.pi() * 1.1, (0,1,0)) + +#%% +#%% +# Now that we have a simple scene, let's render it +print("rendering to", "01_simple_scene.png") +nvisii.render_to_file( + width = opt.width, + height = opt.height, + samples_per_pixel = opt.spp, + file_path = "01_simple_scene.png" +) + +nvisii.deinitialize() From 53be1aabd8088d0767f59c1a6d3bd2fa036d81bc Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 16:25:47 -0600 Subject: [PATCH 17/55] working on refactoring volume codepath to improve efficiency --- src/nvisii/devicecode/launch_params.h | 6 +- src/nvisii/devicecode/path_tracer.cu | 32 ++-- src/nvisii/nvisii.cpp | 242 +++++++------------------- 3 files changed, 78 insertions(+), 202 deletions(-) diff --git a/src/nvisii/devicecode/launch_params.h b/src/nvisii/devicecode/launch_params.h index 717b2cf3..e6142f07 100644 --- a/src/nvisii/devicecode/launch_params.h +++ b/src/nvisii/devicecode/launch_params.h @@ -29,8 +29,7 @@ struct LaunchParams { glm::vec4 *scratchBuffer; glm::vec4 *mvecBuffer; glm::vec4 *accumPtr; - OptixTraversableHandle surfacesIAS; - OptixTraversableHandle volumesIAS; + OptixTraversableHandle IAS; float domeLightIntensity = 1.f; float domeLightExposure = 0.f; glm::vec3 domeLightColor = glm::vec3(-1.f); @@ -60,8 +59,7 @@ struct LaunchParams { Buffer textures; Buffer volumes; Buffer lightEntities; - Buffer surfaceInstanceToEntity; - Buffer volumeInstanceToEntity; + Buffer instanceToEntity; uint32_t 
numLightEntities = 0; Buffer> vertexLists; diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index bd438288..e292428b 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -967,7 +967,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() RayPayload surfPayload; surfPayload.tHit = -1.f; surfRay.time = time; - owl::traceRay( /*accel to trace against*/ LP.surfacesIAS, + owl::traceRay( /*accel to trace against*/ LP.IAS, /*the ray to trace*/ surfRay, /*prd*/ surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); @@ -980,7 +980,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? -2 : -1; - owl::traceRay( /*accel to trace against*/ LP.volumesIAS, + owl::traceRay( /*accel to trace against*/ LP.IAS, /*the ray to trace*/ volRay, /*prd*/ volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); @@ -1020,8 +1020,8 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // Load the object we hit. int entityID; - if (isVolume) { GET(entityID, int, LP.volumeInstanceToEntity, volPayload.instanceID); } - else { GET(entityID, int, LP.surfaceInstanceToEntity, surfPayload.instanceID); } + if (isVolume) { GET(entityID, int, LP.instanceToEntity, volPayload.instanceID); } + else { GET(entityID, int, LP.instanceToEntity, surfPayload.instanceID); } GET(EntityStruct entity, EntityStruct, LP.entities, entityID); GET(TransformStruct transform, TransformStruct, LP.transforms, entity.transform_id); @@ -1035,7 +1035,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); surfPayload.tHit = -1.f; surfRay.time = time; - owl::traceRay( LP.surfacesIAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); volRay = surfRay; volRay.tmax = (surfPayload.tHit == -1.f) ? 
volRay.tmax : surfPayload.tHit; @@ -1044,7 +1044,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? -3 : -1; - owl::traceRay( LP.volumesIAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); transparencyDepth++; if (transparencyDepth > LP.maxTransparencyDepth) break; continue; @@ -1190,7 +1190,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); surfPayload.tHit = -1.f; surfRay.time = time; - owl::traceRay( LP.surfacesIAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); volRay = surfRay; volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; @@ -1199,7 +1199,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? -4 : -1; - owl::traceRay( LP.volumesIAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); ++depth; transparencyDepth++; @@ -1289,7 +1289,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); surfPayload.tHit = -1.f; surfRay.time = time; - owl::traceRay( LP.surfacesIAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); volRay = surfRay; volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; @@ -1298,7 +1298,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? 
-4 : -1; - owl::traceRay( LP.volumesIAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Count this as a "transparent" bounce. ++depth; @@ -1432,13 +1432,13 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() ray.tmin = EPSILON * 10.f; ray.tmax = lightDistance + EPSILON; // needs to be distance to light, else anyhit logic breaks. ray.origin = hit_p; ray.direction = lightDir; ray.time = time; - owl::traceRay( LP.surfacesIAS, ray, surfPayload, occlusion_flags); + owl::traceRay( LP.IAS, ray, surfPayload, occlusion_flags); ray.tmax = (surfPayload.instanceID == -2) ? ray.tmax : surfPayload.tHit; volPayload.rng = rng; volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? -5 : -1; - owl::traceRay( LP.volumesIAS, ray, volPayload, occlusion_flags); + owl::traceRay( LP.IAS, ray, volPayload, occlusion_flags); bool visible; if (randomID == numLights) { // If we sampled the dome light, just check to see if we hit anything @@ -1447,7 +1447,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // If we sampled a light source, then check to see if we hit something other than the light int surfEntity; if (surfPayload.instanceID == -2) surfEntity = -1; - else { GET(surfEntity, int, LP.surfaceInstanceToEntity, surfPayload.instanceID); } + else { GET(surfEntity, int, LP.instanceToEntity, surfPayload.instanceID); } visible = (volPayload.instanceID == -2) && (surfPayload.instanceID == -2 || surfEntity == sampledLightID); } if (visible) { @@ -1475,7 +1475,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() surfPayload.instanceID = -1; surfPayload.tHit = -1.f; surfRay.time = sampleTime(lcg_randomf(rng)); - owl::traceRay(LP.surfacesIAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay(LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); volRay = surfRay; volRay.tmax = (surfPayload.tHit == -1.f) ? 
volRay.tmax : surfPayload.tHit; @@ -1483,7 +1483,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() volPayload.t0 = volRay.tmin; volPayload.t1 = volRay.tmax; volPayload.primitiveID = (debug) ? -6 : -1; - owl::traceRay(LP.volumesIAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + owl::traceRay(LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Check if we hit any of the previously sampled lights bool hitLight = false; @@ -1506,7 +1506,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // else if by sampling the brdf we also hit an area light // TODO: consider hitting emissive voxels? else if (surfPayload.instanceID != -1 && volPayload.instanceID == -1) { - GET(int entityID, int, LP.surfaceInstanceToEntity, surfPayload.instanceID); + GET(int entityID, int, LP.instanceToEntity, surfPayload.instanceID); bool visible = (entityID == sampledLightID); // We hit the light we sampled previously if (visible) { diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 8a7da290..5aea8f48 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -112,8 +112,7 @@ static struct OptixData { OWLBuffer textureBuffer; OWLBuffer volumeBuffer; OWLBuffer lightEntitiesBuffer; - OWLBuffer surfaceInstanceToEntityBuffer; - OWLBuffer volumeInstanceToEntityBuffer; + OWLBuffer instanceToEntityBuffer; OWLBuffer vertexListsBuffer; OWLBuffer normalListsBuffer; OWLBuffer tangentListsBuffer; @@ -144,8 +143,7 @@ static struct OptixData { std::vector volumeGeomList; std::vector volumeBlasList; - OWLGroup surfacesIAS = nullptr; - OWLGroup volumesIAS = nullptr; + OWLGroup IAS = nullptr; std::vector lightEntities; @@ -521,8 +519,7 @@ void initializeOptix(bool headless) { "scratchBuffer", OWL_BUFPTR, OWL_OFFSETOF(LaunchParams, scratchBuffer)}, { "mvecBuffer", OWL_BUFPTR, OWL_OFFSETOF(LaunchParams, mvecBuffer)}, { "accumPtr", OWL_BUFPTR, OWL_OFFSETOF(LaunchParams, accumPtr)}, - { "surfacesIAS", OWL_GROUP, OWL_OFFSETOF(LaunchParams, surfacesIAS)}, - { "volumesIAS", OWL_GROUP, OWL_OFFSETOF(LaunchParams, 
volumesIAS)}, + { "IAS", OWL_GROUP, OWL_OFFSETOF(LaunchParams, IAS)}, { "cameraEntity", OWL_USER_TYPE(EntityStruct), OWL_OFFSETOF(LaunchParams, cameraEntity)}, { "entities", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, entities)}, { "transforms", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, transforms)}, @@ -539,8 +536,7 @@ void initializeOptix(bool headless) { "texCoordLists", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, texCoordLists)}, { "indexLists", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, indexLists)}, { "numLightEntities", OWL_USER_TYPE(uint32_t), OWL_OFFSETOF(LaunchParams, numLightEntities)}, - { "surfaceInstanceToEntity", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, surfaceInstanceToEntity)}, - { "volumeInstanceToEntity", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, volumeInstanceToEntity)}, + { "instanceToEntity", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, instanceToEntity)}, { "domeLightIntensity", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, domeLightIntensity)}, { "domeLightExposure", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, domeLightExposure)}, { "domeLightColor", OWL_USER_TYPE(glm::vec3), OWL_OFFSETOF(LaunchParams, domeLightColor)}, @@ -632,8 +628,7 @@ void initializeOptix(bool headless) OD.volumeBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(VolumeStruct), Volume::getCount(), nullptr); OD.volumeHandlesBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Volume::getCount(), nullptr); OD.lightEntitiesBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); - OD.surfaceInstanceToEntityBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); - OD.volumeInstanceToEntityBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); + OD.instanceToEntityBuffer = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint32_t), 1, nullptr); OD.vertexListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); OD.normalListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, 
Mesh::getCount(), nullptr); OD.tangentListsBuffer = owlDeviceBufferCreate(OD.context, OWL_BUFFER, Mesh::getCount(), nullptr); @@ -650,15 +645,14 @@ void initializeOptix(bool headless) owlParamsSetBuffer(OD.launchParams, "textures", OD.textureBuffer); owlParamsSetBuffer(OD.launchParams, "volumes", OD.volumeBuffer); owlParamsSetBuffer(OD.launchParams, "lightEntities", OD.lightEntitiesBuffer); - owlParamsSetBuffer(OD.launchParams, "surfaceInstanceToEntity", OD.surfaceInstanceToEntityBuffer); - owlParamsSetBuffer(OD.launchParams, "volumeInstanceToEntity", OD.volumeInstanceToEntityBuffer); + owlParamsSetBuffer(OD.launchParams, "instanceToEntity", OD.instanceToEntityBuffer); owlParamsSetBuffer(OD.launchParams, "vertexLists", OD.vertexListsBuffer); owlParamsSetBuffer(OD.launchParams, "normalLists", OD.normalListsBuffer); - owlParamsSetBuffer(OD.launchParams, "tangentLists", OD.tangentListsBuffer); + owlParamsSetBuffer(OD.launchParams, "tangentLists", OD.tangentListsBuffer); owlParamsSetBuffer(OD.launchParams, "texCoordLists", OD.texCoordListsBuffer); owlParamsSetBuffer(OD.launchParams, "indexLists", OD.indexListsBuffer); owlParamsSetBuffer(OD.launchParams, "textureObjects", OD.textureObjectsBuffer); - owlParamsSetBuffer(OD.launchParams, "volumeHandles", OD.volumeHandlesBuffer); + owlParamsSetBuffer(OD.launchParams, "volumeHandles", OD.volumeHandlesBuffer); uint32_t meshCount = Mesh::getCount(); OD.vertexLists.resize(meshCount); @@ -689,24 +683,7 @@ void initializeOptix(bool headless) owlParamsSetBuffer(OD.launchParams, "environmentMapCols", OD.environmentMapColsBuffer); owlParamsSetRaw(OD.launchParams, "environmentMapWidth", &OD.LP.environmentMapWidth); owlParamsSetRaw(OD.launchParams, "environmentMapHeight", &OD.LP.environmentMapHeight); - - // OWLTexture GGX_E_AVG_LOOKUP = owlTexture2DCreate(OD.context, - // OWL_TEXEL_FORMAT_R32F, - // GGX_E_avg_size,1, - // GGX_E_avg, - // OWL_TEXTURE_LINEAR, - // OWL_COLOR_SPACE_LINEAR, - // OWL_TEXTURE_CLAMP); - // OWLTexture 
GGX_E_LOOKUP = owlTexture2DCreate(OD.context, - // OWL_TEXEL_FORMAT_R32F, - // GGX_E_size[0],GGX_E_size[1], - // GGX_E, - // OWL_TEXTURE_LINEAR, - // OWL_TEXTURE_CLAMP, - // OWL_COLOR_SPACE_LINEAR); - // launchParamsSetTexture(OD.launchParams, "GGX_E_AVG_LOOKUP", GGX_E_AVG_LOOKUP); - // launchParamsSetTexture(OD.launchParams, "GGX_E_LOOKUP", GGX_E_LOOKUP); - + OD.LP.numLightEntities = uint32_t(OD.lightEntities.size()); owlParamsSetRaw(OD.launchParams, "numLightEntities", &OD.LP.numLightEntities); owlParamsSetRaw(OD.launchParams, "domeLightIntensity", &OD.LP.domeLightIntensity); @@ -770,10 +747,10 @@ void initializeOptix(bool headless) groupBuildAccel(OD.placeholderGroup); // build IAS - OWLGroup surfacesIAS = instanceGroupCreate(OD.context, 1); - instanceGroupSetChild(surfacesIAS, 0, OD.placeholderGroup); - groupBuildAccel(surfacesIAS); - owlParamsSetGroup(OD.launchParams, "surfacesIAS", surfacesIAS); + OWLGroup IAS = instanceGroupCreate(OD.context, 1); + instanceGroupSetChild(IAS, 0, OD.placeholderGroup); + groupBuildAccel(IAS); + owlParamsSetGroup(OD.launchParams, "IAS", IAS); OWLGeom userGeom = owlGeomCreate(OD.context, OD.volumeGeomType); owlGeomSetPrimCount(userGeom, 1); @@ -783,11 +760,6 @@ void initializeOptix(bool headless) OD.placeholderUserGroup = owlUserGeomGroupCreate(OD.context, 1, &userGeom); groupBuildAccel(OD.placeholderUserGroup); - OWLGroup volumesIAS = instanceGroupCreate(OD.context, 1); - instanceGroupSetChild(volumesIAS, 0, OD.placeholderUserGroup); - groupBuildAccel(volumesIAS); - owlParamsSetGroup(OD.launchParams, "volumesIAS", volumesIAS); - // Build *SBT* required to trace the groups owlBuildPipeline(OD.context); owlBuildSBT(OD.context); @@ -1012,36 +984,6 @@ void setDomeLightSky(vec3 sunPos, vec3 skyTint, float atmosphereThickness, float OptixData.proceduralSkyTexture = owlTexture2DCreate(OptixData.context, OWL_TEXEL_FORMAT_RGBA32F, width, height, texels.data()); owlParamsSetTexture(OptixData.launchParams, "proceduralSkyTexture", 
OptixData.proceduralSkyTexture); - // float invWidth = 1.f / float(width); - // float invHeight = 1.f / float(height); - // float invjacobian = width * height / float(4 * M_PI); - - // auto rows = std::vector(height); - // auto cols = std::vector(width * height); - // for (int y = 0, i = 0; y < height; y++) { - // for (int x = 0; x < width; x++, i++) { - // cols[i] = std::max(texels[i].r, std::max(texels[i].g, texels[i].b)) + ((x > 0) ? cols[i - 1] : 0.f); - // } - // rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f); - // // normalize the pdf for this scanline (if it was non-zero) - // if (cols[i - 1] > 0) { - // for (int x = 0; x < width; x++) { - // cols[i - width + x] /= cols[i - 1]; - // } - // } - // } - - // // normalize the pdf across all scanlines - // for (int y = 0; y < height; y++) - // rows[y] /= rows[height - 1]; - - // if (OptixData.environmentMapRowsBuffer) owlBufferRelease(OptixData.environmentMapRowsBuffer); - // if (OptixData.environmentMapColsBuffer) owlBufferRelease(OptixData.environmentMapColsBuffer); - // OptixData.environmentMapRowsBuffer = owlDeviceBufferCreate(OptixData.context, OWL_USER_TYPE(float), height, rows.data()); - // OptixData.environmentMapColsBuffer = owlDeviceBufferCreate(OptixData.context, OWL_USER_TYPE(float), width * height, cols.data()); - // OptixData.LP.environmentMapWidth = width; - // OptixData.LP.environmentMapHeight = height; - OptixData.LP.environmentMapWidth = 0; OptixData.LP.environmentMapHeight = 0; resetAccumulation(); @@ -1316,21 +1258,11 @@ void updateComponents() // Manage Entities: Build / Rebuild TLAS auto dirtyEntities = Entity::getDirtyEntities(); if (dirtyEntities.size() > 0) { - // Surface instances - std::vector surfaceInstances; - std::vector t0SurfaceTransforms; - std::vector t1SurfaceTransforms; - std::vector surfaceMasks; - std::vector surfaceInstanceToEntity; - - // Volume instances - std::vector volumeInstances; - std::vector t0VolumeTransforms; - std::vector t1VolumeTransforms; - 
std::vector volumeMasks; - std::vector volumeInstanceToEntity; - - // Todo: curves... + std::vector instances; + std::vector t0Transforms; + std::vector t1Transforms; + std::vector masks; + std::vector instanceToEntity; // Aggregate instanced geometry and transformations Entity* entities = Entity::getFront(); @@ -1349,6 +1281,14 @@ void updateComponents() // Get instance transformation glm::mat4 prevLocalToWorld = entities[eid].getTransform()->getLocalToWorldMatrix(/*previous = */true); glm::mat4 localToWorld = entities[eid].getTransform()->getLocalToWorldMatrix(/*previous = */false); + t0Transforms.push_back(prevLocalToWorld); + t1Transforms.push_back(localToWorld); + + // Get instance mask + masks.push_back(entities[eid].getStruct().flags); + + // Indirection from instance back to entity ID + instanceToEntity.push_back(eid); // Add any instanced mesh geometry to the list if (entities[eid].getMesh()) { @@ -1359,98 +1299,63 @@ void updateComponents() // Mark it as dirty. It should be available in a subsequent frame entities[eid].getMesh()->markDirty(); return; } - surfaceInstances.push_back(blas); - surfaceInstanceToEntity.push_back(eid); - t0SurfaceTransforms.push_back(prevLocalToWorld); - t1SurfaceTransforms.push_back(localToWorld); - surfaceMasks.push_back(entities[eid].getStruct().flags); + instances.push_back(blas); } // Add any instanced volume geometry to the list - if (entities[eid].getVolume()) { + else if (entities[eid].getVolume()) { uint32_t address = entities[eid].getVolume()->getAddress(); OWLGroup blas = OD.volumeBlasList[address]; if (!blas) { // Same as meshes, if BLAS doesn't exist, force BLAS build and try again. 
entities[eid].getMesh()->markDirty(); return; } - volumeInstances.push_back(blas); - volumeInstanceToEntity.push_back(eid); - t0VolumeTransforms.push_back(prevLocalToWorld); - t1VolumeTransforms.push_back(localToWorld); - volumeMasks.push_back(entities[eid].getStruct().flags); - } + instances.push_back(blas); + } + + else { + throw std::runtime_error("Internal Error, renderable entity has no mesh or volume components!?"); + } } - std::vector owlSurfaceVisibilityMasks; - std::vector t0OwlSurfaceTransforms; - std::vector t1OwlSurfaceTransforms; - std::vector owlVolumeVisibilityMasks; - std::vector t0OwlVolumeTransforms; - std::vector t1OwlVolumeTransforms; - auto oldSurfaceIAS = OD.surfacesIAS; - auto oldVolumeIAS = OD.volumesIAS; + std::vector owlVisibilityMasks; + std::vector t0OwlTransforms; + std::vector t1OwlTransforms; + auto oldIAS = OD.IAS; - // If no surfaces instanced, insert an unhittable placeholder. + // If no objects are instanced, insert an unhittable placeholder. // (required for certain older driver versions) - if (surfaceInstances.size() == 0) { - OD.surfacesIAS = instanceGroupCreate(OD.context, 1); - instanceGroupSetChild(OD.surfacesIAS, 0, OD.placeholderGroup); - groupBuildAccel(OD.surfacesIAS); + if (instances.size() == 0) { + OD.IAS = instanceGroupCreate(OD.context, 1); + instanceGroupSetChild(OD.IAS, 0, OD.placeholderGroup); + groupBuildAccel(OD.IAS); } - // If no volumes instanced, insert an unhittable placeholder. 
- // (required for certain older driver versions) - if (volumeInstances.size() == 0) { - OD.volumesIAS = instanceGroupCreate(OD.context, 1); - instanceGroupSetChild(OD.volumesIAS, 0, OD.placeholderUserGroup); - groupBuildAccel(OD.volumesIAS); - } - - // Set surface transforms to IAS, upload surface instance to entity map - if (surfaceInstances.size() > 0) { - OD.surfacesIAS = instanceGroupCreate(OD.context, surfaceInstances.size()); - for (uint32_t iid = 0; iid < surfaceInstances.size(); ++iid) { - instanceGroupSetChild(OD.surfacesIAS, iid, surfaceInstances[iid]); - t0OwlSurfaceTransforms.push_back(glmToOWL(t0SurfaceTransforms[iid])); - t1OwlSurfaceTransforms.push_back(glmToOWL(t1SurfaceTransforms[iid])); - owlSurfaceVisibilityMasks.push_back(surfaceMasks[iid]); + // Set instance transforms and masks, upload instance to entity map + if (instances.size() > 0) { + OD.IAS = instanceGroupCreate(OD.context, instances.size()); + for (uint32_t iid = 0; iid < instances.size(); ++iid) { + instanceGroupSetChild(OD.IAS, iid, instances[iid]); + t0OwlTransforms.push_back(glmToOWL(t0Transforms[iid])); + t1OwlTransforms.push_back(glmToOWL(t1Transforms[iid])); + owlVisibilityMasks.push_back(masks[iid]); } - owlInstanceGroupSetTransforms(OD.surfacesIAS,0,(const float*)t0OwlSurfaceTransforms.data()); - owlInstanceGroupSetTransforms(OD.surfacesIAS,1,(const float*)t1OwlSurfaceTransforms.data()); - owlInstanceGroupSetVisibilityMasks(OD.surfacesIAS, owlSurfaceVisibilityMasks.data()); - owlBufferResize(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.size()); - owlBufferUpload(OD.surfaceInstanceToEntityBuffer, surfaceInstanceToEntity.data()); + owlInstanceGroupSetTransforms(OD.IAS,0,(const float*)t0OwlTransforms.data()); + owlInstanceGroupSetTransforms(OD.IAS,1,(const float*)t1OwlTransforms.data()); + owlInstanceGroupSetVisibilityMasks(OD.IAS, owlVisibilityMasks.data()); + owlBufferResize(OD.instanceToEntityBuffer, instanceToEntity.size()); + 
owlBufferUpload(OD.instanceToEntityBuffer, instanceToEntity.data()); } - // Set volume transforms to IAS, upload volume instance to entity map - if (volumeInstances.size() > 0) { - OD.volumesIAS = instanceGroupCreate(OD.context, volumeInstances.size()); - for (uint32_t iid = 0; iid < volumeInstances.size(); ++iid) { - instanceGroupSetChild(OD.volumesIAS, iid, volumeInstances[iid]); - t0OwlVolumeTransforms.push_back(glmToOWL(t0VolumeTransforms[iid])); - t1OwlVolumeTransforms.push_back(glmToOWL(t1VolumeTransforms[iid])); - owlVolumeVisibilityMasks.push_back(volumeMasks[iid]); - } - owlInstanceGroupSetTransforms(OD.volumesIAS,0,(const float*)t0OwlVolumeTransforms.data()); - owlInstanceGroupSetTransforms(OD.volumesIAS,1,(const float*)t1OwlVolumeTransforms.data()); - owlInstanceGroupSetVisibilityMasks(OD.volumesIAS, owlVolumeVisibilityMasks.data()); - owlBufferResize(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.size()); - owlBufferUpload(OD.volumeInstanceToEntityBuffer, volumeInstanceToEntity.data()); - } - // Build IAS - groupBuildAccel(OD.volumesIAS); - owlParamsSetGroup(OD.launchParams, "volumesIAS", OD.volumesIAS); - groupBuildAccel(OD.surfacesIAS); - owlParamsSetGroup(OD.launchParams, "surfacesIAS", OD.surfacesIAS); - + groupBuildAccel(OD.IAS); + owlParamsSetGroup(OD.launchParams, "IAS", OD.IAS); + // Now that IAS have changed, we need to rebuild SBT owlBuildSBT(OD.context); // Release any old IAS (TODO, don't rebuild if entity edit doesn't effect IAS...) - if (oldSurfaceIAS) {owlGroupRelease(oldSurfaceIAS);} - if (oldVolumeIAS) {owlGroupRelease(oldVolumeIAS);} + if (oldIAS) {owlGroupRelease(oldIAS);} // Aggregate entities that are light sources (todo: consider emissive volumes...) 
OD.lightEntities.resize(0); @@ -1608,19 +1513,6 @@ void updateComponents() auto dirtyTransforms = Transform::getDirtyTransforms(); if (dirtyTransforms.size() > 0) { Transform::updateComponents(); - - // // for each device - // for (uint32_t id = 0; id < owlGetDeviceCount(OptixData.context); ++id) - // { - // cudaSetDevice(id); - - // TransformStruct* devTransforms = (TransformStruct*)owlBufferGetPointer(OptixData.transformBuffer, id); - // TransformStruct* transformStructs = Transform::getFrontStruct(); - // for (auto &t : dirtyTransforms) { - // if (!t->isInitialized()) continue; - // CUDA_CHECK(cudaMemcpy(&devTransforms[t->getAddress()], &transformStructs[t->getAddress()], sizeof(TransformStruct), cudaMemcpyHostToDevice)); - // } - // } // cudaSetDevice(0); owlBufferUpload(OptixData.transformBuffer, Transform::getFrontStruct()); @@ -2450,20 +2342,6 @@ void renderToFile(uint32_t width, uint32_t height, uint32_t samplesPerPixel, std } } -// void renderDataToPNG(uint32_t width, uint32_t height, uint32_t startFrame, uint32_t frameCount, uint32_t bounce, std::string field, std::string imagePath) -// { -// std::vector fb = renderData(width, height, startFrame, frameCount, bounce, field); -// std::vector colors(4 * width * height); -// for (size_t i = 0; i < (width * height); ++i) { -// colors[i * 4 + 0] = uint8_t(glm::clamp(fb[i * 4 + 0] * 255.f, 0.f, 255.f)); -// colors[i * 4 + 1] = uint8_t(glm::clamp(fb[i * 4 + 1] * 255.f, 0.f, 255.f)); -// colors[i * 4 + 2] = uint8_t(glm::clamp(fb[i * 4 + 2] * 255.f, 0.f, 255.f)); -// colors[i * 4 + 3] = uint8_t(glm::clamp(fb[i * 4 + 3] * 255.f, 0.f, 255.f)); -// } -// stbi_flip_vertically_on_write(true); -// stbi_write_png(imagePath.c_str(), width, height, /* num channels*/ 4, colors.data(), /* stride in bytes */ width * 4); -// } - void initializeComponentFactories( uint32_t maxEntities, uint32_t maxCameras, From 94316a6320ee8c92532349b8cdc69a97e387025d Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 14 Apr 2021 21:35:12 -0600 
Subject: [PATCH 18/55] refacting volume rendering code. Now much cheaper to render surfaces without any volumes present --- externals/owl | 2 +- include/nvisii/entity_struct.h | 14 +- src/nvisii/devicecode/path_tracer.cu | 614 +++++++++++++-------------- src/nvisii/entity.cpp | 1 + src/nvisii/nvisii.cpp | 3 - 5 files changed, 313 insertions(+), 321 deletions(-) diff --git a/externals/owl b/externals/owl index f1d01daa..0f82536c 160000 --- a/externals/owl +++ b/externals/owl @@ -1 +1 @@ -Subproject commit f1d01daa451151e6ac64c7f9d6662b354b467384 +Subproject commit 0f82536cd56668ca786d4f5e8eb796e6b03752c9 diff --git a/include/nvisii/entity_struct.h b/include/nvisii/entity_struct.h index 9e77c832..86b0d396 100644 --- a/include/nvisii/entity_struct.h +++ b/include/nvisii/entity_struct.h @@ -6,12 +6,12 @@ #ifndef ENTITY_VISIBILITY_FLAGS #define ENTITY_VISIBILITY_FLAGS -#define ENTITY_VISIBILITY_CAMERA_RAYS (1<<0) -#define ENTITY_VISIBILITY_DIFFUSE_RAYS (1<<1) -#define ENTITY_VISIBILITY_GLOSSY_RAYS (1<<2) -#define ENTITY_VISIBILITY_TRANSMISSION_RAYS (1<<3) -#define ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS (1<<4) -#define ENTITY_VISIBILITY_SHADOW_RAYS (1<<5) +#define ENTITY_VISIBILITY_CAMERA_RAYS (1<<0) // object is visible to direct camera rays +#define ENTITY_VISIBILITY_DIFFUSE_RAYS (1<<1) // object is visible to diffuse rays +#define ENTITY_VISIBILITY_GLOSSY_RAYS (1<<2) // object is visible to glossy rays +#define ENTITY_VISIBILITY_TRANSMISSION_RAYS (1<<3) // object is visible to transmission rays +#define ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS (1<<4) // object is visible to multiple-scattering volume rays +#define ENTITY_VISIBILITY_SHADOW_RAYS (1<<5) // object is visible to shadow rays (ie, casts shadows) #endif struct EntityStruct { @@ -22,7 +22,7 @@ struct EntityStruct { int32_t light_id = -1; int32_t mesh_id = -1; int32_t volume_id = -1; - int32_t flags = 1; + uint32_t flags = (uint32_t)-1; glm::vec4 bbmin = glm::vec4(0.f); glm::vec4 bbmax = glm::vec4(0.f); }; \ No 
newline at end of file diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index e292428b..1def657d 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -263,190 +263,173 @@ OPTIX_CLOSEST_HIT_PROGRAM(VolumeMesh)() { auto &LP = optixLaunchParams; RayPayload &prd = owl::getPRD(); - const auto &self = owl::getProgramData(); - LCGRand rng = prd.rng; - - // Load the volume we hit - GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); - uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; - const auto grid = reinterpret_cast(hdl); - const auto& tree = grid->tree(); - auto acc = tree.getAccessor(); - - auto bbox = acc.root().bbox(); - auto mx = bbox.max(); - auto mn = bbox.min(); - glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + - (glm::vec3(mx[0], mx[1], mx[2]) - - glm::vec3(mn[0], mn[1], mn[2])) * .5f; - - float majorant_extinction = acc.root().valueMax(); - float gradient_factor = volume.gradient_factor; - float linear_attenuation_unit = volume.scale; - float absorption = volume.absorption; - float scattering = volume.scattering; - - vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; - vec3 w = make_vec3(prd.objectSpaceRayDirection); - - linear_attenuation_unit /= length(w); - - // Move ray to volume boundary - float t0 = prd.t0, t1 = prd.t1; - x = x + t0 * w; - t1 = t1 - t0; - t0 = 0.f; - - // Sample the free path distance to see if our ray makes it to the boundary - float t; - int event; - bool hitVolume = false; - #define MAX_NULL_COLLISIONS 10000 - for (int dti = 0; dti < MAX_NULL_COLLISIONS; ++dti) { - SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, - absorption, scattering, x, w, t1, t, event); - x = x + t * w; - - // The boundary was hit - if (event == 0) { - break; - } - - // An absorption / emission event occurred - if (event == 1) { - hitVolume = true; - break; - } + // const auto &self = 
owl::getProgramData(); + // LCGRand rng = prd.rng; + + // // Load the volume we hit + // GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); + // uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; + // const auto grid = reinterpret_cast(hdl); + // const auto& tree = grid->tree(); + // auto acc = tree.getAccessor(); + + // auto bbox = acc.root().bbox(); + // auto mx = bbox.max(); + // auto mn = bbox.min(); + // glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + + // (glm::vec3(mx[0], mx[1], mx[2]) - + // glm::vec3(mn[0], mn[1], mn[2])) * .5f; + + // float majorant_extinction = acc.root().valueMax(); + // float gradient_factor = volume.gradient_factor; + // float linear_attenuation_unit = volume.scale; + // float absorption = volume.absorption; + // float scattering = volume.scattering; + + // vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; + // vec3 w = make_vec3(prd.objectSpaceRayDirection); + + // linear_attenuation_unit /= length(w); + + // // Move ray to volume boundary + // float t0 = prd.t0, t1 = prd.t1; + // x = x + t0 * w; + // t1 = t1 - t0; + // t0 = 0.f; + + // // Sample the free path distance to see if our ray makes it to the boundary + // float t; + // int event; + // bool hitVolume = false; + // #define MAX_NULL_COLLISIONS 10000 + // for (int dti = 0; dti < MAX_NULL_COLLISIONS; ++dti) { + // SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, + // absorption, scattering, x, w, t1, t, event); + // x = x + t * w; + + // // The boundary was hit + // if (event == 0) { + // break; + // } + + // // An absorption / emission event occurred + // if (event == 1) { + // hitVolume = true; + // break; + // } + + // // A scattering event occurred + // if (event == 2) { + // hitVolume = true; + // break; + // } + + // // A null collision occurred. + // if (event == 3) { + // // update boundary in relation to the new collision x, w does not change. 
+ // t1 = t1 - t; + // } + // } - // A scattering event occurred - if (event == 2) { - hitVolume = true; - break; - } + optixGetObjectToWorldTransformMatrix(prd.localToWorld); - // A null collision occurred. - if (event == 3) { - // update boundary in relation to the new collision x, w does not change. - t1 = t1 - t; - } - } + // If we don't need motion vectors, (or in the future if an object + // doesn't have motion blur) then return. + if (LP.renderDataMode == RenderDataFlags::NONE) return; - if (!hitVolume) { - prd.tHit = -1.f; - } - else { - prd.instanceID = optixGetInstanceIndex(); - prd.eventID = event; - prd.tHit = t; - - auto sampler = nanovdb::SampleFromVoxels, /*Interpolation Degree*/1, /*UseCache*/false>(acc); - auto coord_pos = nanovdb::Coord::Floor( nanovdb::Vec3f(x.x, x.y, x.z) ); - float densityValue = acc.getValue(coord_pos); - auto g = sampler.gradient(nanovdb::Vec3f(x.x, x.y, x.z)); - - prd.mp = make_float3(x - offset); // not super confident about this offset... - prd.gradient = make_float3(g[0], g[1], g[2]);// TEMPORARY FOR BUNNY - prd.density = densityValue; - optixGetObjectToWorldTransformMatrix(prd.localToWorld); - - // If we don't need motion vectors, (or in the future if an object - // doesn't have motion blur) then return. 
- if (LP.renderDataMode == RenderDataFlags::NONE) return; + OptixTraversableHandle handle = optixGetTransformListHandle(prd.instanceID); + float4 trf00, trf01, trf02; + float4 trf10, trf11, trf12; - OptixTraversableHandle handle = optixGetTransformListHandle(prd.instanceID); - float4 trf00, trf01, trf02; - float4 trf10, trf11, trf12; - - optix_impl::optixGetInterpolatedTransformationFromHandle( trf00, trf01, trf02, handle, /* time */ 0.f, true ); - optix_impl::optixGetInterpolatedTransformationFromHandle( trf10, trf11, trf12, handle, /* time */ 1.f, true ); - memcpy(&prd.localToWorldT0[0], &trf00, sizeof(trf00)); - memcpy(&prd.localToWorldT0[4], &trf01, sizeof(trf01)); - memcpy(&prd.localToWorldT0[8], &trf02, sizeof(trf02)); - memcpy(&prd.localToWorldT1[0], &trf10, sizeof(trf10)); - memcpy(&prd.localToWorldT1[4], &trf11, sizeof(trf11)); - memcpy(&prd.localToWorldT1[8], &trf12, sizeof(trf12)); - } + optix_impl::optixGetInterpolatedTransformationFromHandle( trf00, trf01, trf02, handle, /* time */ 0.f, true ); + optix_impl::optixGetInterpolatedTransformationFromHandle( trf10, trf11, trf12, handle, /* time */ 1.f, true ); + memcpy(&prd.localToWorldT0[0], &trf00, sizeof(trf00)); + memcpy(&prd.localToWorldT0[4], &trf01, sizeof(trf01)); + memcpy(&prd.localToWorldT0[8], &trf02, sizeof(trf02)); + memcpy(&prd.localToWorldT1[0], &trf10, sizeof(trf10)); + memcpy(&prd.localToWorldT1[4], &trf11, sizeof(trf11)); + memcpy(&prd.localToWorldT1[8], &trf12, sizeof(trf12)); } OPTIX_CLOSEST_HIT_PROGRAM(VolumeShadowRay)() { - auto &LP = optixLaunchParams; - const auto &self = owl::getProgramData(); - RayPayload &prd = owl::getPRD(); - LCGRand rng = prd.rng; - - GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); - uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; - const auto grid = reinterpret_cast(hdl); - const auto& tree = grid->tree(); - auto acc = tree.getAccessor(); - - auto bbox = acc.root().bbox(); - auto mx = bbox.max(); - auto mn = 
bbox.min(); - glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + - (glm::vec3(mx[0], mx[1], mx[2]) - - glm::vec3(mn[0], mn[1], mn[2])) * .5f; - - float majorant_extinction = acc.root().valueMax(); - float gradient_factor = volume.gradient_factor; - float linear_attenuation_unit = volume.scale; - float absorption = volume.absorption; - float scattering = volume.scattering; - - vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; - vec3 w = make_vec3(prd.objectSpaceRayDirection); - - linear_attenuation_unit /= length(w); - - // Move ray to volume boundary - float t0 = prd.t0, t1 = prd.t1; - x = x + t0 * w; - t1 = t1 - t0; - t0 = 0.f; - - // Sample the free path distance to see if our ray makes it to the boundary - float t; - int event; - bool hitVolume = false; - #define MAX_NULL_COLLISIONS 10000 - for (int dti = 0; dti < MAX_NULL_COLLISIONS; ++dti) { - SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, - absorption, scattering, x, w, t1, t, event); - x = x + t * w; - - // The boundary was hit - if (event == 0) { - break; - } - - // An absorption / emission event occurred - if (event == 1) { - hitVolume = true; - break; - } - - // A scattering event occurred - if (event == 2) { - hitVolume = true; - break; - } - - // A null collision occurred. - if (event == 3) { - // update boundary in relation to the new collision x, w does not change. 
- t1 = t1 - t; - } - } + // auto &LP = optixLaunchParams; + // const auto &self = owl::getProgramData(); + // RayPayload &prd = owl::getPRD(); + // LCGRand rng = prd.rng; + + // GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); + // uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; + // const auto grid = reinterpret_cast(hdl); + // const auto& tree = grid->tree(); + // auto acc = tree.getAccessor(); + + // auto bbox = acc.root().bbox(); + // auto mx = bbox.max(); + // auto mn = bbox.min(); + // glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + + // (glm::vec3(mx[0], mx[1], mx[2]) - + // glm::vec3(mn[0], mn[1], mn[2])) * .5f; + + // float majorant_extinction = acc.root().valueMax(); + // float gradient_factor = volume.gradient_factor; + // float linear_attenuation_unit = volume.scale; + // float absorption = volume.absorption; + // float scattering = volume.scattering; + + // vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; + // vec3 w = make_vec3(prd.objectSpaceRayDirection); + + // linear_attenuation_unit /= length(w); + + // // Move ray to volume boundary + // float t0 = prd.t0, t1 = prd.t1; + // x = x + t0 * w; + // t1 = t1 - t0; + // t0 = 0.f; + + // // Sample the free path distance to see if our ray makes it to the boundary + // float t; + // int event; + // bool hitVolume = false; + // #define MAX_NULL_COLLISIONS 10000 + // for (int dti = 0; dti < MAX_NULL_COLLISIONS; ++dti) { + // SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, + // absorption, scattering, x, w, t1, t, event); + // x = x + t * w; + + // // The boundary was hit + // if (event == 0) { + // break; + // } + + // // An absorption / emission event occurred + // if (event == 1) { + // hitVolume = true; + // break; + // } + + // // A scattering event occurred + // if (event == 2) { + // hitVolume = true; + // break; + // } + + // // A null collision occurred. 
+ // if (event == 3) { + // // update boundary in relation to the new collision x, w does not change. + // t1 = t1 - t; + // } + // } - if (!hitVolume) { - prd.tHit = -1.f; - } - else { - prd.instanceID = optixGetInstanceIndex(); - prd.eventID = event; - prd.tHit = t; - } + // if (!hitVolume) { + // prd.tHit = -1.f; + // } + // else { + // prd.instanceID = optixGetInstanceIndex(); + // prd.eventID = event; + // prd.tHit = t; + // } } OPTIX_INTERSECT_PROGRAM(VolumeIntersection)() @@ -454,7 +437,9 @@ OPTIX_INTERSECT_PROGRAM(VolumeIntersection)() // float old_tmax = optixGetRayTmax(); // const int primID = optixGetPrimitiveIndex(); + auto &LP = optixLaunchParams; const auto &self = owl::getProgramData(); + RayPayload &prd = owl::getPRD(); float3 origin = optixGetObjectRayOrigin(); // note, this is _not_ normalized. Useful for computing world space tmin/mmax @@ -492,12 +477,78 @@ OPTIX_INTERSECT_PROGRAM(VolumeIntersection)() // clip hit to near position thit0 = max(thit0, optixGetRayTmin()); - RayPayload &prd = owl::getPRD(); - if (optixReportIntersection(thit0, /* hit kind */ 0)) { - prd.objectSpaceRayOrigin = origin; - prd.objectSpaceRayDirection = direction; - prd.t0 = max(prd.t0, thit0); - prd.t1 = min(prd.t1, thit1); + // Load the volume we hit + GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); + uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; + const auto grid = reinterpret_cast(hdl); + const auto& tree = grid->tree(); + auto acc = tree.getAccessor(); + auto nvdbSampler = nanovdb::SampleFromVoxels, + /*Interpolation Degree*/1, /*UseCache*/false>(acc); + + float majorant_extinction = acc.root().valueMax(); + float gradient_factor = volume.gradient_factor; + float linear_attenuation_unit = volume.scale; + float absorption = volume.absorption; + float scattering = volume.scattering; + + auto bbox = acc.root().bbox(); + auto mx = bbox.max(); + auto mn = bbox.min(); + float3 offset = make_float3(glm::vec3(mn[0], mn[1], 
mn[2]) + + (glm::vec3(mx[0], mx[1], mx[2]) - + glm::vec3(mn[0], mn[1], mn[2])) * .5f); + + // Sample the free path distance to see if our ray makes it to the boundary + float t = thit0; + int event; + bool hitVolume = false; + float unit = volume.scale / length(direction); + #define MAX_NULL_COLLISIONS 1000 + for (int i = 0; i < MAX_NULL_COLLISIONS; ++i) { + // Sample a distance + t = t - (log(1.0f - lcg_randomf(prd.rng)) / majorant_extinction) * unit; + + // A boundary has been hit, no intersection + if (t >= thit1) return; + + // Update current position + float3 x = offset + origin + t * direction; + + // Sample heterogeneous media + float densityValue = nvdbSampler(nanovdb::Vec3f(x.x, x.y, x.z)); + + float a = densityValue * absorption; + float s = densityValue * scattering; + float e = a + s; + float n = majorant_extinction - e; + + a = a / majorant_extinction; + s = s / majorant_extinction; + n = n / majorant_extinction; + + float event = lcg_randomf(prd.rng); + // An absorption/emission collision occured + if (event < (a + s)) { + if (optixReportIntersection(t, /* hit kind */ 0)) { + auto g = nvdbSampler.gradient(nanovdb::Vec3f(x.x, x.y, x.z)); + prd.objectSpaceRayOrigin = origin; + prd.objectSpaceRayDirection = direction; + prd.eventID = (event < a) ? 1 : 2; + prd.instanceID = optixGetInstanceIndex(); + prd.tHit = t; + prd.mp = x - offset; // not super confident about this offset... + prd.gradient = make_float3(g[0], g[1], g[2]);// TEMPORARY FOR BUNNY + prd.density = densityValue; + } + return; + } + + // A null collision occurred + else { + event = 3; + continue; + } } } @@ -930,7 +981,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float time = sampleTime(lcg_randomf(rng)); // If no camera is in use, just display some random noise... 
- owl::Ray surfRay; + owl::Ray ray; EntityStruct camera_entity; TransformStruct camera_transform; CameraStruct camera; @@ -941,8 +992,8 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // Trace an initial ray through the scene - surfRay = generateRay(camera, camera_transform, pixelID, make_float2(LP.frameSize), rng, time); - surfRay.tmax = tmax; + ray = generateRay(camera, camera_transform, pixelID, make_float2(LP.frameSize), rng, time); + ray.tmax = tmax; float3 accum_illum = make_float3(0.f); float3 pathThroughput = make_float3(1.f); @@ -964,25 +1015,13 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float3 directIllum = make_float3(0.f); float3 illum = make_float3(0.f); - RayPayload surfPayload; - surfPayload.tHit = -1.f; - surfRay.time = time; + RayPayload payload; + payload.tHit = -1.f; + ray.time = time; + ray.visibilityMask = ENTITY_VISIBILITY_CAMERA_RAYS; owl::traceRay( /*accel to trace against*/ LP.IAS, - /*the ray to trace*/ surfRay, - /*prd*/ surfPayload, - OPTIX_RAY_FLAG_DISABLE_ANYHIT); - - owl::Ray volRay = surfRay; - volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; - RayPayload volPayload; - volPayload.tHit = -1.f; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? 
-2 : -1; - owl::traceRay( /*accel to trace against*/ LP.IAS, - /*the ray to trace*/ volRay, - /*prd*/ volPayload, + /*the ray to trace*/ ray, + /*prd*/ payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Shade each hit point on a path using NEE with MIS @@ -990,22 +1029,22 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float alpha = 0.f; // If ray misses, terminate the ray - if ((surfPayload.tHit <= 0.f) && (volPayload.tHit <= 0.f)) { + if (payload.tHit <= 0.f) { // Compute lighting from environment if (depth == 0) { - float3 col = missColor(surfRay, envTex); + float3 col = missColor(ray, envTex); illum = illum + pathThroughput * (col * LP.domeLightIntensity); directIllum = illum; primaryAlbedo = col; } else if (enableDomeSampling) - illum = illum + pathThroughput * (missColor(surfRay, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); + illum = illum + pathThroughput * (missColor(ray, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); const float envDist = 10000.0f; // large value /* Compute miss motion vector */ float3 mvec; // Point far away - float3 pFar = surfRay.origin + surfRay.direction * envDist; + float3 pFar = ray.origin + ray.direction * envDist; // TODO: account for motion from rotating dome light vec4 tmp1 = LP.proj * LP.viewT0 * /*xfmt0 **/ make_vec4(pFar, 1.0f); float3 pt0 = make_float3(tmp1 / tmp1.w) * .5f; @@ -1016,15 +1055,13 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() break; } - bool isVolume = (volPayload.tHit >= 0.f); // Load the object we hit. 
- int entityID; - if (isVolume) { GET(entityID, int, LP.instanceToEntity, volPayload.instanceID); } - else { GET(entityID, int, LP.instanceToEntity, surfPayload.instanceID); } - + GET(int entityID, int, LP.instanceToEntity, payload.instanceID); GET(EntityStruct entity, EntityStruct, LP.entities, entityID); GET(TransformStruct transform, TransformStruct, LP.transforms, entity.transform_id); + + bool isVolume = (entity.volume_id != -1); MeshStruct mesh; VolumeStruct volume; if (!isVolume) { GET(mesh, MeshStruct, LP.meshes, entity.mesh_id); } @@ -1032,48 +1069,37 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // Skip forward if the hit object is invisible for this ray type, skip it. if (((entity.flags & ENTITY_VISIBILITY_CAMERA_RAYS) == 0)) { - surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); - surfPayload.tHit = -1.f; - surfRay.time = time; - owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - - volRay = surfRay; - volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; - volPayload.tHit = -1.f; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? -3 : -1; - owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); + payload.tHit = -1.f; + ray.time = time; + owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); transparencyDepth++; if (transparencyDepth > LP.maxTransparencyDepth) break; continue; } // Set new outgoing light direction and hit position. 
- const float3 w_o = -surfRay.direction; - float3 hit_p; - if (volPayload.tHit >= 0.f) hit_p = volRay.origin + volPayload.tHit * volRay.direction; - else hit_p = surfRay.origin + surfPayload.tHit * surfRay.direction; + const float3 w_o = -ray.direction; + float3 hit_p = ray.origin + payload.tHit * ray.direction; // Load geometry data for the hit object float3 mp, p, v_x, v_y, v_z, v_gz, v_bz; float2 uv; float3 diffuseMotion; - if (volPayload.tHit >= 0.f) { + if (isVolume) { v_x = v_y = make_float3(0.f); // Perhaps I could use divergence / curl here? - v_z = v_gz = normalize(volPayload.gradient); + v_z = v_gz = normalize(payload.gradient); if (any(isnan(make_vec3(v_z)))) v_z = v_gz = make_float3(0.f); - mp = volPayload.mp; - uv = make_float2(volPayload.density, length(volPayload.gradient)); + mp = payload.mp; + uv = make_float2(payload.density, length(payload.gradient)); } else { int3 indices; - loadMeshTriIndices(entity.mesh_id, mesh.numTris, surfPayload.primitiveID, indices); - loadMeshVertexData(entity.mesh_id, mesh.numVerts, indices, surfPayload.barycentrics, mp, v_gz); - loadMeshUVData(entity.mesh_id, mesh.numVerts, indices, surfPayload.barycentrics, uv); - loadMeshNormalData(entity.mesh_id, mesh.numVerts, indices, surfPayload.barycentrics, v_z); - loadMeshTangentData(entity.mesh_id, mesh.numVerts, indices, surfPayload.barycentrics, v_x); + loadMeshTriIndices(entity.mesh_id, mesh.numTris, payload.primitiveID, indices); + loadMeshVertexData(entity.mesh_id, mesh.numVerts, indices, payload.barycentrics, mp, v_gz); + loadMeshUVData(entity.mesh_id, mesh.numVerts, indices, payload.barycentrics, uv); + loadMeshNormalData(entity.mesh_id, mesh.numVerts, indices, payload.barycentrics, v_z); + loadMeshTangentData(entity.mesh_id, mesh.numVerts, indices, payload.barycentrics, v_x); } // Load material data for the hit object @@ -1085,7 +1111,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // Transform geometry data into world space { - glm::mat4 xfm = to_mat4((volPayload.tHit >= 0.f) ? 
volPayload.localToWorld : surfPayload.localToWorld); + glm::mat4 xfm = to_mat4(payload.localToWorld); p = make_float3(xfm * make_vec4(mp, 1.0f)); hit_p = p; glm::mat3 nxfm = transpose(glm::inverse(glm::mat3(xfm))); @@ -1096,8 +1122,8 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() v_x = cross(v_y, v_z); if (LP.renderDataMode != RenderDataFlags::NONE) { - glm::mat4 xfmt0 = to_mat4((volPayload.tHit >= 0.f) ? volPayload.localToWorldT0 : surfPayload.localToWorldT0); - glm::mat4 xfmt1 = to_mat4((volPayload.tHit >= 0.f) ? volPayload.localToWorldT1 : surfPayload.localToWorldT1); + glm::mat4 xfmt0 = to_mat4(payload.localToWorldT0); + glm::mat4 xfmt1 = to_mat4(payload.localToWorldT1); vec4 tmp1 = LP.proj * LP.viewT0 * xfmt0 * make_vec4(mp, 1.0f); vec4 tmp2 = LP.proj * LP.viewT1 * xfmt1 * make_vec4(mp, 1.0f); float3 pt0 = make_float3(tmp1 / tmp1.w) * .5f; @@ -1176,7 +1202,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // For segmentations, save geometric metadata - saveGeometricRenderData(renderData, depth, surfPayload.tHit, hit_p, v_z, w_o, uv, entityID, diffuseMotion, time, mat); + saveGeometricRenderData(renderData, depth, payload.tHit, hit_p, v_z, w_o, uv, entityID, diffuseMotion, time, mat); if (depth == 0) { primaryAlbedo = mat.base_color; primaryNormal = v_z; @@ -1187,20 +1213,10 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float alpha_rnd = lcg_randomf(rng); if (alpha_rnd > mat.alpha) { - surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); - surfPayload.tHit = -1.f; - surfRay.time = time; - owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - - volRay = surfRay; - volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; - volPayload.tHit = -1.f; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? 
-4 : -1; - owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - + ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); + payload.tHit = -1.f; + ray.time = time; + owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); ++depth; transparencyDepth++; continue; @@ -1211,14 +1227,14 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // Note that NEE/MIS will also potentially terminate the path, preventing double-counting. // todo: account for volumetric emission here... if (entity.light_id >= 0 && entity.light_id < LP.lights.count) { - float dotNWi = max(dot(surfRay.direction, v_z), 0.f); + float dotNWi = max(dot(ray.direction, v_z), 0.f); if ((dotNWi > EPSILON) && (depth != 0)) break; GET(LightStruct entityLight, LightStruct, LP.lights, entity.light_id); float3 lightEmission; if (entityLight.color_texture_id == -1) lightEmission = make_float3(entityLight.r, entityLight.g, entityLight.b); else lightEmission = sampleTexture(entityLight.color_texture_id, uv, make_float3(0.f, 0.f, 0.f)); - float dist = surfPayload.tHit; + float dist = payload.tHit; lightEmission = (lightEmission * entityLight.intensity); if (depth != 0) lightEmission = (lightEmission * pow(2.f, entityLight.exposure)) / max((dist * dist), 1.f); float3 contribution = pathThroughput * lightEmission; @@ -1233,7 +1249,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float3 irradiance = make_float3(0.f); // If we hit a volume, use hybrid scattering to determine whether or not to use a BRDF or a phase function. - if (volPayload.tHit >= 0.f) { + if (isVolume) { float opacity = mat.alpha; // would otherwise be sampled from a transfer function float grad_len = uv.y; float p_brdf = opacity * (1.f - exp(-25.f * pow(volume.gradient_factor, 3.f) * grad_len)); @@ -1257,7 +1273,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() w_i, bsdfPDF, sampledBsdf, bsdf); // outputs } else { /* a scatter event occurred */ - if (volPayload.eventID == 2) { + if (payload.eventID == 2) { // currently isotropic. 
Todo: implement henyey greenstien... float rand1 = lcg_randomf(rng); float rand2 = lcg_randomf(rng); @@ -1273,7 +1289,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } /* An absorption / emission event occurred */ - if (volPayload.eventID == 1) { + if (payload.eventID == 1) { bsdfPDF = 1.f / (4.0 * M_PI); bsdf = make_float3(1.f / (4.0 * M_PI)); w_i = -w_o; @@ -1286,19 +1302,10 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // At this point, if we are refracting and we ran out of transmission bounces, skip forward. // This avoids creating black regions on glass objects due to bounce limits if (sampledBsdf == DISNEY_TRANSMISSION_BRDF && transmissionDepth >= LP.maxTransmissionDepth) { - surfRay.origin = surfRay.origin + surfRay.direction * (surfPayload.tHit + EPSILON); - surfPayload.tHit = -1.f; - surfRay.time = time; - owl::traceRay( LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - - volRay = surfRay; - volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; - volPayload.tHit = -1.f; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? -4 : -1; - owl::traceRay( LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); + payload.tHit = -1.f; + ray.time = time; + owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Count this as a "transparent" bounce. ++depth; @@ -1426,32 +1433,27 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } lightPDF *= (1.f / float(numLights + 1.f)) * (1.f / float(numTris)); if ((lightPDF > 0.0) && (dotNWi > EPSILON)) { - RayPayload surfPayload; surfPayload.instanceID = -2; - RayPayload volPayload = surfPayload; + RayPayload payload; payload.instanceID = -2; + RayPayload volPayload = payload; owl::RayT ray; // shadow ray ray.tmin = EPSILON * 10.f; ray.tmax = lightDistance + EPSILON; // needs to be distance to light, else anyhit logic breaks. 
ray.origin = hit_p; ray.direction = lightDir; ray.time = time; - owl::traceRay( LP.IAS, ray, surfPayload, occlusion_flags); - ray.tmax = (surfPayload.instanceID == -2) ? ray.tmax : surfPayload.tHit; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? -5 : -1; - owl::traceRay( LP.IAS, ray, volPayload, occlusion_flags); + owl::traceRay( LP.IAS, ray, payload, occlusion_flags); + ray.tmax = (payload.instanceID == -2) ? ray.tmax : payload.tHit; bool visible; if (randomID == numLights) { // If we sampled the dome light, just check to see if we hit anything - visible = (surfPayload.instanceID == -2) && (volPayload.instanceID == -2); + visible = (payload.instanceID == -2); } else { // If we sampled a light source, then check to see if we hit something other than the light int surfEntity; - if (surfPayload.instanceID == -2) surfEntity = -1; - else { GET(surfEntity, int, LP.instanceToEntity, surfPayload.instanceID); } - visible = (volPayload.instanceID == -2) && (surfPayload.instanceID == -2 || surfEntity == sampledLightID); + if (payload.instanceID == -2) surfEntity = -1; + else { GET(surfEntity, int, LP.instanceToEntity, payload.instanceID); } + visible = (payload.instanceID == -2 || surfEntity == sampledLightID); } if (visible) { - if (randomID != numLights) lightEmission = lightEmission / max(pow(surfPayload.tHit, falloff),1.f); + if (randomID != numLights) lightEmission = lightEmission / max(pow(payload.tHit, falloff),1.f); float w = power_heuristic(1.f, lightPDF, 1.f, bsdfPDF); float3 Li = (lightEmission * w) / lightPDF; irradiance = irradiance + (l_bsdf * Li); @@ -1469,33 +1471,25 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // Next, sample a light source using the importance sampled BDRF direction. 
- surfRay.origin = hit_p; - surfRay.direction = w_i; - surfRay.tmin = EPSILON;//* 100.f; - surfPayload.instanceID = -1; - surfPayload.tHit = -1.f; - surfRay.time = sampleTime(lcg_randomf(rng)); - owl::traceRay(LP.IAS, surfRay, surfPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - - volRay = surfRay; - volRay.tmax = (surfPayload.tHit == -1.f) ? volRay.tmax : surfPayload.tHit; - volPayload.rng = rng; - volPayload.t0 = volRay.tmin; - volPayload.t1 = volRay.tmax; - volPayload.primitiveID = (debug) ? -6 : -1; - owl::traceRay(LP.IAS, volRay, volPayload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); + ray.origin = hit_p; + ray.direction = w_i; + ray.tmin = EPSILON;//* 100.f; + payload.instanceID = -1; + payload.tHit = -1.f; + ray.time = sampleTime(lcg_randomf(rng)); + owl::traceRay(LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Check if we hit any of the previously sampled lights bool hitLight = false; if (lightPDF > EPSILON) { - float dotNWi = (useBRDF) ? max(dot(surfRay.direction, v_gz), 0.f) : 1.f; // geometry term + float dotNWi = (useBRDF) ? max(dot(ray.direction, v_gz), 0.f) : 1.f; // geometry term // if by sampling the brdf we also hit the dome light... - if ((surfPayload.instanceID == -1) && (volPayload.instanceID == -1) && (sampledLightID == -1) && enableDomeSampling) { + if ((payload.instanceID == -1) && (sampledLightID == -1) && enableDomeSampling) { // Case where we hit the background, and also previously sampled the background float w = power_heuristic(1.f, bsdfPDF, 1.f, lightPDF); - float3 lightEmission = missColor(surfRay, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure); + float3 lightEmission = missColor(ray, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure); float3 Li = (lightEmission * w) / bsdfPDF; if (dotNWi > 0.f) { @@ -1505,8 +1499,8 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // else if by sampling the brdf we also hit an area light // TODO: consider hitting emissive voxels? 
- else if (surfPayload.instanceID != -1 && volPayload.instanceID == -1) { - GET(int entityID, int, LP.instanceToEntity, surfPayload.instanceID); + else if (payload.instanceID != -1) { + GET(int entityID, int, LP.instanceToEntity, payload.instanceID); bool visible = (entityID == sampledLightID); // We hit the light we sampled previously if (visible) { @@ -1514,10 +1508,10 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() GET(EntityStruct light_entity, EntityStruct, LP.entities, sampledLightID); GET(MeshStruct light_mesh, MeshStruct, LP.meshes, light_entity.mesh_id); GET(LightStruct light_light, LightStruct, LP.lights, light_entity.light_id); - loadMeshTriIndices(light_entity.mesh_id, light_mesh.numTris, surfPayload.primitiveID, indices); - loadMeshUVData(light_entity.mesh_id, light_mesh.numVerts, indices, surfPayload.barycentrics, uv); + loadMeshTriIndices(light_entity.mesh_id, light_mesh.numTris, payload.primitiveID, indices); + loadMeshUVData(light_entity.mesh_id, light_mesh.numVerts, indices, payload.barycentrics, uv); - float dist = surfPayload.tHit; + float dist = payload.tHit; float3 lightEmission; if (light_light.color_texture_id == -1) lightEmission = make_float3(light_light.r, light_light.g, light_light.b) * (light_light.intensity * pow(2.f, light_light.exposure)); diff --git a/src/nvisii/entity.cpp b/src/nvisii/entity.cpp index aac5d55c..5bc7df95 100644 --- a/src/nvisii/entity.cpp +++ b/src/nvisii/entity.cpp @@ -32,6 +32,7 @@ Entity::Entity(std::string name, uint32_t id) { entity.material_id = -1; entity.light_id = -1; entity.mesh_id = -1; + entity.flags = (uint32_t)-1; } std::string Entity::toString() diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 5aea8f48..7f28ef7d 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1220,15 +1220,12 @@ void updateComponents() // Next, allocate resources for the new volume. 
auto gridHdlPtr = v->getNanoVDBGridHandle(); const nanovdb::FloatGrid* grid = reinterpret_cast(gridHdlPtr.get()->data()); - std::cout<checksum()<tree().getAccessor(); // auto bbox = tree.root().bbox(); auto bbox = grid->tree().bbox().asReal(); // int nodecount = grid->tree().nodeCount(3); - // std::cout<getAddress()] = owlDeviceBufferCreate(OD.context, OWL_USER_TYPE(uint8_t), gridHdlPtr.get()->size(), nullptr); owlBufferUpload(OD.volumeHandles[v->getAddress()], gridHdlPtr.get()->data()); From cf6e014dd725c5b6d43cb465e3ef4728de7f4c39 Mon Sep 17 00:00:00 2001 From: n8vm Date: Thu, 15 Apr 2021 12:31:35 -0600 Subject: [PATCH 19/55] upgrades to volume example --- examples/22.volumes.py | 178 +++++++++++++++++++++++++--------------- include/nvisii/volume.h | 9 +- src/nvisii/volume.cpp | 7 +- 3 files changed, 124 insertions(+), 70 deletions(-) diff --git a/examples/22.volumes.py b/examples/22.volumes.py index 9a74eb14..3b1878df 100644 --- a/examples/22.volumes.py +++ b/examples/22.volumes.py @@ -2,33 +2,31 @@ # 22.volumes.py # -# This shows an example of two volumes. One volume uses the NanoVDB format, -# and the other is a raw volume. +# This shows an example of several volumes. Some volume uses the NanoVDB format, +# others use a raw volume, and then some are generated procedurally. +# This scene tests how volumes can be lit up with light sources, and how they can +# overlap. + +# Note, the API here is subject to change with future versions... import nvisii import numpy as np opt = lambda: None -opt.spp = 50 -opt.width = 512 -opt.height = 512 +opt.spp = 512 +opt.width = 1024 +opt.height = 1024 opt.out = '22_volumes.png' -# headless - no window -# verbose - output number of frames rendered, etc.. nvisii.initialize(headless = False, verbose = True, window_on_top = True) - -#%% -# Use a neural network to denoise ray traced nvisii.enable_denoiser() -# First, lets create an entity that will serve as our camera. 
-camera = nvisii.entity.create(name = "camera") +# Configuring the denoiser here to not use albedo and normal guides, which are +# noisy for volumes +nvisii.configure_denoiser(False, False, True) -# To place the camera into our scene, we'll add a "transform" component. -# (All nvisii objects have a "name" that can be used for easy lookup later.) +# Make a camera... +camera = nvisii.entity.create(name = "camera") camera.set_transform(nvisii.transform.create(name = "camera_transform")) - -# To make our camera entity act like a "camera", we'll add a camera component camera.set_camera( nvisii.camera.create_from_fov( name = "camera_camera", @@ -36,93 +34,131 @@ aspect = opt.width / float(opt.height) ) ) - -# Finally, we'll select this entity to be the current camera entity. -# (nvisii can only use one camera at the time) nvisii.set_camera_entity(camera) +camera.get_transform().look_at(at = (0, 0, .5), up = (0, 0, 1), eye = (0, 5, 2)) + +# Make a dome light +env_tex = nvisii.texture.create_from_file("env_tex", "./content/kiara_4_mid-morning_4k.hdr") +nvisii.enable_dome_light_sampling() +nvisii.set_dome_light_texture(env_tex, enable_cdf=True) +nvisii.set_dome_light_exposure(-2.0) -# Lets set the camera to look at an object. -# We'll do this by editing the transform component. -camera.get_transform().look_at(at = (0, 0, .9), up = (0, 0, 1), eye = (0, 5, 1)) -# Next, lets at an object (a floor). +# Make a textured floor floor = nvisii.entity.create( name = "floor", mesh = nvisii.mesh.create_plane("mesh_floor"), transform = nvisii.transform.create("transform_floor"), material = nvisii.material.create("material_floor") ) - -# Lets make our floor act as a mirror mat = floor.get_material() -# mat = nvisii.material.get("material_floor") # <- this also works -#%% -# Mirrors are smooth and "metallic". 
-mat.set_base_color((1.,1.,1.))
-mat.set_metallic(0)
-mat.set_roughness(1)
-
-# Make the floor large by scaling it
+floor_tex = nvisii.texture.create_from_file("floor_tex", "./content/salle_de_bain_separated/textures/WoodFloor_BaseColor.jpg")
+mat.set_base_color_texture(floor_tex)
 trans = floor.get_transform()
 trans.set_scale((5,5,1))
 
-#%%
-# Let's also add a sphere
+# Make a procedural torus volume
 torus = nvisii.entity.create(
     name="torus",
     volume = nvisii.volume.create_torus("torus"),
     transform = nvisii.transform.create("torus"),
     material = nvisii.material.create("torus")
 )
-#%%
-torus.get_transform().set_position((.5,2,0.35))
+torus.get_transform().set_position((0.8,2,.2))
 torus.get_transform().set_scale((0.003, 0.003, 0.003))
-torus.get_transform().set_angle_axis(3.14 * .25, (1,0,0))
-torus.get_material().set_base_color((.0,0.0,1))
-torus.get_material().set_roughness(0.0)
-torus.get_material().set_transmission(0.0)
-torus.get_volume().set_gradient_factor(10)
-torus.get_volume().set_absorption(0)
-torus.get_volume().set_scattering(1)
-torus.get_volume().set_scale(50)
-
-#%%
-# Let's also add a bunny
+torus.get_transform().set_angle_axis(nvisii.pi() * .5, (1,0,0))
+torus.get_material().set_base_color((1.,1.,1.0))
+# The gradient factor here controls how "surface like" the volume is.
+# Higher values mean "more surface like" in areas where there is a strong
+# gradient in the scalar field of the volume (which occurs near surfaces defined
+# by high density regions)
+torus.get_volume().set_gradient_factor(10)
+
+# Absorption controls the probability of light being absorbed by the volume
+torus.get_volume().set_absorption(1.)
+# Scattering controls the probability of light bouncing off one of the particles in the volume
+torus.get_volume().set_scattering(.0)
+# The scale here controls how "big" a voxel is, where "1" means a voxel is 1cm wide.
+# Larger scales result in particles being distributed over longer distances, +# causing the volume to appear less dense +torus.get_volume().set_scale(100) + +# Create a procedural octahedron +octahedron = nvisii.entity.create( + name="octahedron", + volume = nvisii.volume.create_octahedron("octahedron"), + transform = nvisii.transform.create("octahedron"), + material = nvisii.material.create("octahedron") +) +octahedron.get_transform().set_position((.80,2.0,0.2)) # Note that this octahedron is inside the torus +octahedron.get_transform().set_scale((0.01, 0.01, 0.01)) +octahedron.get_transform().set_angle_axis(nvisii.pi() * .25, (0,0,1)) +octahedron.get_material().set_base_color((1.0,0.0,0)) +octahedron.get_volume().set_gradient_factor(10) +octahedron.get_volume().set_absorption(0) +octahedron.get_volume().set_scattering(1) +octahedron.get_volume().set_scale(15) + +# Create a procedural sphere +sphere = nvisii.entity.create( + name="sphere", + volume = nvisii.volume.create_sphere("sphere"), + transform = nvisii.transform.create("sphere"), + material = nvisii.material.create("sphere") +) +sphere.get_transform().set_position((-1.0,2,0.25)) +sphere.get_transform().set_scale((0.0025, 0.0025, 0.0025)) +sphere.get_material().set_base_color((0.2,0.2,1.0)) +sphere.get_volume().set_gradient_factor(10) +sphere.get_volume().set_absorption(0) +sphere.get_volume().set_scattering(1) +sphere.get_volume().set_scale(100) + +# Create a procedural box +box = nvisii.entity.create( + name="box", + volume = nvisii.volume.create_box("box"), + transform = nvisii.transform.create("box"), + material = nvisii.material.create("box") +) +box.get_transform().set_position((-1.0,2,0.25)) +box.get_transform().set_scale((0.005, 0.005, 0.005)) +box.get_transform().set_angle_axis(.3, (0,0,1)) +box.get_material().set_base_color((1.0,1.0,1.0)) +box.get_volume().set_gradient_factor(10) +box.get_volume().set_absorption(0) +box.get_volume().set_scattering(1) +box.get_volume().set_scale(100) + +# Create a 
cloudy bunny using a nanovdb file bunny = nvisii.entity.create( name="bunny", volume = nvisii.volume.create_from_file("bunny", "./content/bunny_cloud.nvdb"), transform = nvisii.transform.create("bunny"), material = nvisii.material.create("bunny") ) -#%% -bunny.get_transform().set_position((-1,0,0.75)) +bunny.get_transform().set_position((-.8,.5,0.75)) bunny.get_transform().set_scale((0.003, 0.003, 0.003)) bunny.get_material().set_base_color((0.1,0.9,0.08)) bunny.get_material().set_roughness(0.7) bunny.get_volume().set_gradient_factor(10) -bunny.get_volume().set_absorption(0) -bunny.get_volume().set_scattering(1) -bunny.get_volume().set_scale(10) +bunny.get_volume().set_absorption(1) +bunny.get_volume().set_scattering(0) +bunny.get_volume().set_scale(4) bunny.get_transform().set_angle_axis(nvisii.pi() * .5, (1,0,0)) bunny.get_transform().add_angle_axis(nvisii.pi(), (0,1,0)) -#%% +# Create a boston teapot using a raw CT scanned volume voxels = np.fromfile("./content/boston_teapot_256x256x178_uint8.raw", dtype=np.uint8).astype(np.float32) - - - -#%% -# Let's also add a teapot teapot = nvisii.entity.create( name="teapot", volume = nvisii.volume.create_from_data("teapot", width = 256, height = 256, depth = 178, data = voxels, background = 0.0), transform = nvisii.transform.create("teapot"), material = nvisii.material.create("teapot") ) -#%% teapot.get_transform().set_position((1,0,0.7)) teapot.get_transform().set_scale((0.005, 0.005, 0.005)) -teapot.get_material().set_base_color((1.0,0.0,0.0)) +teapot.get_material().set_base_color((1.0,1.0,1.0)) teapot.get_material().set_roughness(0.0) teapot.get_material().set_metallic(1.0) teapot.get_volume().set_gradient_factor(100) @@ -132,15 +168,25 @@ teapot.get_transform().set_angle_axis(-nvisii.pi() * .5, (1,0,0)) teapot.get_transform().add_angle_axis(nvisii.pi() * 1.1, (0,1,0)) -#%% -#%% -# Now that we have a simple scene, let's render it -print("rendering to", "01_simple_scene.png") +# Volumes can be lit up using light sources 
+light = nvisii.entity.create( + name="light", + mesh = nvisii.mesh.create_sphere("light"), + transform = nvisii.transform.create("light"), + light = nvisii.light.create("light") +) +light.get_transform().set_position((0,1,2.5)) +light.get_transform().set_scale((.2,.2,.2)) +light.get_light().set_temperature(4000) +light.get_light().set_intensity(20) + +# Render out the image +print("rendering to", "22_volumes.png") nvisii.render_to_file( width = opt.width, height = opt.height, samples_per_pixel = opt.spp, - file_path = "01_simple_scene.png" + file_path = "22_volumes.png" ) nvisii.deinitialize() diff --git a/include/nvisii/volume.h b/include/nvisii/volume.h index d71236d3..7ceddeb8 100644 --- a/include/nvisii/volume.h +++ b/include/nvisii/volume.h @@ -78,8 +78,15 @@ class Volume : public StaticFactory * is 0 and inactive, the interior is active with values varying * smoothly from 0 at the surface of the box to 1 at the half width * and interior of the box. + * @param name The name of the volume to create. + * @param size The width, height, and depth of the box in local units. 
+ * @param center The center of the box in local units + * @param half_width The half-width of the narrow band in voxel units */ - static Volume *createBox(std::string name); + static Volume *createBox(std::string name, + glm::vec3 size = glm::vec3(100.f), + glm::vec3 center = glm::vec3(0.f), + float half_width = 3.f); /** * Creates a sparse fog volume of an octahedron such that the exterior diff --git a/src/nvisii/volume.cpp b/src/nvisii/volume.cpp index 838b7c51..8654e7f6 100644 --- a/src/nvisii/volume.cpp +++ b/src/nvisii/volume.cpp @@ -188,10 +188,11 @@ Volume *Volume::createTorus(std::string name) } } -Volume *Volume::createBox(std::string name) +Volume *Volume::createBox(std::string name, + glm::vec3 size, glm::vec3 center, float halfWidth) { - auto create = [] (Volume* v) { - nanovdb::GridHandle<> gridHdl = nanovdb::createFogVolumeBox(); + auto create = [size, center, halfWidth] (Volume* v) { + nanovdb::GridHandle<> gridHdl = nanovdb::createFogVolumeBox(size.x, size.y, size.z, ((nanovdb::Vec3R)(0)), 1.0f, halfWidth); v->gridHdlPtr = std::make_shared>(std::move(gridHdl)); v->markDirty(); }; From 86a0503ea5d2f9a65c167edba7e9b672eeae0594 Mon Sep 17 00:00:00 2001 From: n8vm Date: Thu, 15 Apr 2021 16:05:53 -0600 Subject: [PATCH 20/55] ray visibility flags now working. 
Added an example demonstrating feature --- examples/23.ray_visibility.py | 282 +++++++++++++++++++++++++++ include/nvisii/entity.h | 8 +- include/nvisii/nvisii.h | 2 +- src/nvisii/devicecode/path_tracer.cu | 19 +- 4 files changed, 295 insertions(+), 16 deletions(-) create mode 100644 examples/23.ray_visibility.py diff --git a/examples/23.ray_visibility.py b/examples/23.ray_visibility.py new file mode 100644 index 00000000..5311621f --- /dev/null +++ b/examples/23.ray_visibility.py @@ -0,0 +1,282 @@ +import nvisii +import math +import PySide2 +import colorsys +from PySide2.QtCore import * +from PySide2.QtWidgets import * + +nvisii.initialize() +nvisii.resize_window(1000,1000) +nvisii.enable_denoiser() +# nvisii.configure_denoiser(False, False, True) +nvisii.set_max_bounce_depth(diffuse_depth=2, glossy_depth = 8, transparency_depth = 8, transmission_depth = 12, volume_depth = 2) + +# Set the sky +nvisii.disable_dome_light_sampling() +nvisii.set_dome_light_color((0,0,0)) + +# Set camera +camera = nvisii.entity.create( + name = "camera", + transform = nvisii.transform.create(name = "camera_transform"), + camera = nvisii.camera.create( + name = "camera_camera", + aspect = 1.0 + ) +) +camera.get_transform().look_at( + at = (0, 0, 0.5), # at position + up = (0, 0, 1), # up vector + eye = (0, 5, 2) # eye position +) +nvisii.set_camera_entity(camera) + +# Floor +floor = nvisii.entity.create( + name = "floor", + mesh = nvisii.mesh.create_plane("mesh_floor"), + transform = nvisii.transform.create("transform_floor"), + material = nvisii.material.create("material_floor") +) +floor.get_material().set_base_color((0.19,0.16,0.19)) +floor.get_material().set_metallic(0) +floor.get_material().set_roughness(1) +floor.get_transform().set_scale((5,5,1)) + +# Mirror 1 +mirror1 = nvisii.entity.create( + name = "mirror1", + mesh = nvisii.mesh.create_box("mesh_mirror1"), + transform = nvisii.transform.create("transform_mirror1"), + material = nvisii.material.create("material_mirror1") +) 
+mirror1.get_transform().look_at(eye = (-1.5, -1.5, .5), at = (0,0,.7), up = (0,0,1)) +mirror1.get_material().set_base_color((1.,1.,1.)) +mirror1.get_material().set_metallic(1) +mirror1.get_material().set_roughness(0) +mirror1.get_transform().set_scale((.7,.7,.1)) + +# Glass 1 +glass1 = nvisii.entity.create( + name = "glass1", + mesh = nvisii.mesh.create_box("mesh_glass1"), + transform = nvisii.transform.create("transform_glass1"), + material = nvisii.material.create("material_glass1") +) +glass1.get_transform().look_at(eye = (1.5, 1.5, .5), at = (0,0,.7), up = (0,0,1)) +glass1.get_material().set_base_color((1.,1.,1.)) +glass1.get_material().set_transmission(1) +glass1.get_material().set_roughness(0) +glass1.get_transform().set_scale((.7,.7,.1)) + +# Mirror 2 +mirror2 = nvisii.entity.create( + name = "mirror2", + mesh = nvisii.mesh.create_box("mesh_mirror2"), + transform = nvisii.transform.create("transform_mirror2"), + material = nvisii.material.create("material_mirror2") +) +mirror2.get_transform().look_at(eye = (1.5, -1.5, .5), at = (0,0,.7), up = (0,0,1)) +mirror2.get_material().set_base_color((1.,1.,1.)) +mirror2.get_material().set_metallic(1) +mirror2.get_material().set_roughness(0) +mirror2.get_transform().set_scale((.7,.7,.1)) + +# Glass 2 +glass2 = nvisii.entity.create( + name = "glass2", + mesh = nvisii.mesh.create_box("mesh_glass2"), + transform = nvisii.transform.create("transform_glass2"), + material = nvisii.material.create("material_glass2") +) +glass2.get_transform().look_at(eye = (-1.5, 1.5, .5), at = (0,0,.7), up = (0,0,1)) +glass2.get_material().set_base_color((1.,1.,1.)) +glass2.get_material().set_transmission(1) +glass2.get_material().set_roughness(0) +glass2.get_transform().set_scale((.7,.7,.1)) + +# Fog +fog = nvisii.entity.create( + name = "fog", + volume = nvisii.volume.create_box("mesh_fog"), + transform = nvisii.transform.create("transform_fog"), + material = nvisii.material.create("material_fog") +) 
+fog.get_material().set_base_color((1.,1.,1.)) +fog.get_material().set_transmission(1) +fog.get_material().set_roughness(0) +fog.get_volume().set_scale(100) + +# Light +light = nvisii.entity.create( + name = "light", + light = nvisii.light.create("light"), + transform = nvisii.transform.create("light"), + mesh = nvisii.mesh.create_sphere("light") +) +light.get_transform().set_position((0,0,5)) +light.get_transform().set_scale((.1,.1,.1)) +light.get_light().set_exposure(7) + +# Light blocker +blocker = nvisii.entity.create( + name = "blocker", + mesh = nvisii.mesh.create_capped_tube("blocker", innerRadius = .04), + transform = nvisii.transform.create("blocker"), + material = nvisii.material.create("blocker") +) +blocker.get_transform().set_scale((10,10,.01)) +blocker.get_transform().set_position((0,0,3.0)) + +# Teapot +teapotahedron = nvisii.entity.create( + name="teapotahedron", + mesh = nvisii.mesh.create_teapotahedron("teapotahedron", segments = 32), + transform = nvisii.transform.create("teapotahedron"), + material = nvisii.material.create("teapotahedron") +) +teapotahedron.get_transform().set_rotation(nvisii.angleAxis(nvisii.pi() / 4.0, (0,0,1))) +teapotahedron.get_transform().set_position((0,0,0)) +teapotahedron.get_transform().set_scale((0.4, 0.4, 0.4)) +teapotahedron.get_material().set_base_color((255.0 / 255.0, 100.0 / 255.0, 2.0 / 256.0)) +teapotahedron.get_material().set_roughness(0.0) +teapotahedron.get_material().set_specular(1.0) +teapotahedron.get_material().set_metallic(1.0) + +# Make a QT window to demonstrate the difference between alpha transparency and transmission +app = QApplication([]) # Start an application. +window = QWidget() # Create a window. +layout = QVBoxLayout() # Create a layout. 
+ +def rotateCamera(value): + value = value / 100.0 + cam_pos = camera.get_transform().get_position() + + camera.get_transform().look_at( + at = (0, 0, 0.5), # at position + up = (0, 0, 1), # up vector + eye = (5 * math.cos(value * 2 * nvisii.pi()), 5 * math.sin(value * 2 * nvisii.pi()), cam_pos[2]) # eye position + ) +rotateCamera(0) +dial = QDial() +dial.setWrapping(True) +dial.valueChanged[int].connect(rotateCamera) +layout.addWidget(QLabel('Camera rotation')) +layout.addWidget(dial) + +def rotateCameraElevation(value): + # print(value) + value = value / 100 + cam_pos = camera.get_transform().get_position() + camera.get_transform().look_at( + at = (0, 0, 0.5), # at position + up = (0, 0, 1), # up vector + eye = (cam_pos[0], cam_pos[1], 0.1 + 2.5*value) # eye position + ) + # print(value, 2 * math.cos(value * 2 * nvisii.pi())) + +slider = QSlider(Qt.Horizontal) +slider.valueChanged[int].connect(rotateCameraElevation) +slider.setValue(40) +layout.addWidget(QLabel('Camera Elevation')) +layout.addWidget(slider) + +# Add some toggles to demonstrate how the set_visibility function works + +camera_visibility = True +diffuse_visibility = True +glossy_visibility = True +transmission_visibility = True +scatter_visibility = True +shadow_visibility = True +def updateVisibility(): + global camera_visibility + global diffuse_visibility + global glossy_visibility + global transmission_visibility + global scatter_visibility + global shadow_visibility + + teapotahedron.set_visibility( + camera = camera_visibility, + diffuse = diffuse_visibility, + glossy = glossy_visibility, + transmission = transmission_visibility, + volume_scatter = scatter_visibility, + shadow = shadow_visibility) + +def toggleCamera(): + global camera_visibility + camera_visibility = not camera_visibility + updateVisibility() +button = QPushButton("toggleCamera") +button.clicked.connect(toggleCamera) +layout.addWidget(button) + +def toggleDiffuse(): + global diffuse_visibility + diffuse_visibility = not 
diffuse_visibility + updateVisibility() +button = QPushButton("toggleDiffuse") +button.clicked.connect(toggleDiffuse) +layout.addWidget(button) + +def toggleGlossy(): + global glossy_visibility + glossy_visibility = not glossy_visibility + updateVisibility() +button = QPushButton("toggleGlossy") +button.clicked.connect(toggleGlossy) +layout.addWidget(button) + +def toggleTransmission(): + global transmission_visibility + transmission_visibility = not transmission_visibility + updateVisibility() +button = QPushButton("toggleTransmission") +button.clicked.connect(toggleTransmission) +layout.addWidget(button) + +def toggleScattering(): + global scatter_visibility + scatter_visibility = not scatter_visibility + updateVisibility() +button = QPushButton("toggleScattering") +button.clicked.connect(toggleScattering) +layout.addWidget(button) + +def toggleShadows(): + global shadow_visibility + shadow_visibility = not shadow_visibility + updateVisibility() +button = QPushButton("toggleShadows") +button.clicked.connect(toggleShadows) +layout.addWidget(button) + +def setFogStrength(value): + value = (100 - value) * 2 + 10 + fog.get_volume().set_scale(value) +setFogStrength(100) +slider = QSlider(Qt.Horizontal) +slider.valueChanged[int].connect(setFogStrength) +slider.setValue(100) +layout.addWidget(QLabel('Fog Strength')) +layout.addWidget(slider) + + +def setLightHeight(value): + value = value / 100.0 + light.get_transform().set_position((0,0,3 + value * 2)) +setLightHeight(50) +slider = QSlider(Qt.Horizontal) +slider.valueChanged[int].connect(setLightHeight) +slider.setValue(50) +layout.addWidget(QLabel('Light Height')) +layout.addWidget(slider) + + +window.setLayout(layout) +window.show() +app.exec_() + +nvisii.deinitialize() \ No newline at end of file diff --git a/include/nvisii/entity.h b/include/nvisii/entity.h index 9fadab2a..5abcf28a 100644 --- a/include/nvisii/entity.h +++ b/include/nvisii/entity.h @@ -212,10 +212,10 @@ class Entity : public StaticFactory { /** * 
Objects can be set to be invisible to particular ray types: * @param camera Makes the object visible to camera rays (the first rays to be traced from the camera). - * @param diffuse (todo...) Makes the object visible to diffuse rays (eg for diffuse GI) - * @param glossy (todo...) Makes the object visible to glossy rays (eg in reflections) - * @param transmission (todo...) Makes the object visible to transmission rays (eg from inside glass) - * @param volume_scatter (todo...) Makes the object visible to volume scatter rays (eg from light simulation inside a volume) + * @param diffuse Makes the object visible to diffuse rays (eg for diffuse GI) + * @param glossy Makes the object visible to glossy rays (eg in reflections) + * @param transmission Makes the object visible to transmission rays (eg from inside glass) + * @param volume_scatter Makes the object visible to volume scatter rays (eg from light simulation inside a volume) * @param shadow Enables the object to cast shadows. */ void setVisibility( diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index e14a2a97..dd770fe7 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -207,7 +207,7 @@ void setDirectLightingClamp(float clamp); */ void setMaxBounceDepth( uint32_t diffuse_depth = 2, - uint32_t glossy_depth = 2, + uint32_t glossy_depth = 8, uint32_t transparency_depth = 8, uint32_t transmission_depth = 12, uint32_t volume_depth = 2 diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index 1def657d..c0fbe130 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -1067,17 +1067,6 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() if (!isVolume) { GET(mesh, MeshStruct, LP.meshes, entity.mesh_id); } else { GET(volume, VolumeStruct, LP.volumes, entity.volume_id); } - // Skip forward if the hit object is invisible for this ray type, skip it. 
- if (((entity.flags & ENTITY_VISIBILITY_CAMERA_RAYS) == 0)) { - ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); - payload.tHit = -1.f; - ray.time = time; - owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); - transparencyDepth++; - if (transparencyDepth > LP.maxTransparencyDepth) break; - continue; - } - // Set new outgoing light direction and hit position. const float3 w_o = -ray.direction; float3 hit_p = ray.origin + payload.tHit * ray.direction; @@ -1216,6 +1205,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); payload.tHit = -1.f; ray.time = time; + // ray.visibilityMask reuses the last visibility mask here owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); ++depth; transparencyDepth++; @@ -1305,6 +1295,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() ray.origin = ray.origin + ray.direction * (payload.tHit + EPSILON); payload.tHit = -1.f; ray.time = time; + // ray.visibilityMask reuses the last visibility mask here owl::traceRay( LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Count this as a "transparent" bounce. @@ -1439,6 +1430,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() ray.tmin = EPSILON * 10.f; ray.tmax = lightDistance + EPSILON; // needs to be distance to light, else anyhit logic breaks. ray.origin = hit_p; ray.direction = lightDir; ray.time = time; + ray.visibilityMask = ENTITY_VISIBILITY_SHADOW_RAYS; owl::traceRay( LP.IAS, ray, payload, occlusion_flags); ray.tmax = (payload.instanceID == -2) ? 
ray.tmax : payload.tHit; bool visible; @@ -1477,6 +1469,11 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() payload.instanceID = -1; payload.tHit = -1.f; ray.time = sampleTime(lcg_randomf(rng)); + if (isVolume) ray.visibilityMask = ENTITY_VISIBILITY_VOLUME_SCATTER_RAYS; + else if (sampledBsdf == DISNEY_TRANSMISSION_BRDF) ray.visibilityMask = ENTITY_VISIBILITY_TRANSMISSION_RAYS; + else if (sampledBsdf == DISNEY_DIFFUSE_BRDF) ray.visibilityMask = ENTITY_VISIBILITY_DIFFUSE_RAYS; + else if (sampledBsdf == DISNEY_GLOSSY_BRDF) ray.visibilityMask = ENTITY_VISIBILITY_GLOSSY_RAYS; + else if (sampledBsdf == DISNEY_CLEARCOAT_BRDF) ray.visibilityMask = ENTITY_VISIBILITY_GLOSSY_RAYS; owl::traceRay(LP.IAS, ray, payload, OPTIX_RAY_FLAG_DISABLE_ANYHIT); // Check if we hit any of the previously sampled lights From 1b8a87b69ed78543dc53a9b4d69d8a9b7dc24047 Mon Sep 17 00:00:00 2001 From: n8vm Date: Thu, 15 Apr 2021 16:29:11 -0600 Subject: [PATCH 21/55] fixing some tests. Normal map test is failing --- examples/14.normal_map.py | 6 +++--- examples/content.txt | 1 + examples/requirements.txt | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/14.normal_map.py b/examples/14.normal_map.py index dce409ca..f0518799 100644 --- a/examples/14.normal_map.py +++ b/examples/14.normal_map.py @@ -103,9 +103,9 @@ mat.set_specular(0) # load an example brick texture -color_tex = nvisii.texture.create_from_file("color",'content/Bricks051_2K_Color.jpg') -normal_tex = nvisii.texture.create_from_file("normal",'content/Bricks051_2K_Normal.jpg', linear = True) -rough_tex = nvisii.texture.create_from_file("rough",'content/Bricks051_2K_Roughness.jpg', linear = True) +color_tex = nvisii.texture.create_from_file("color",'./content/Bricks051_2K_Color.jpg') +normal_tex = nvisii.texture.create_from_file("normal",'./content/Bricks051_2K_Normal.jpg', linear = True) +rough_tex = nvisii.texture.create_from_file("rough",'./content/Bricks051_2K_Roughness.jpg', linear = True) color_tex.set_scale((.1,.1)) 
normal_tex.set_scale((.1,.1)) diff --git a/examples/content.txt b/examples/content.txt index 1b7882e6..232b1289 100644 --- a/examples/content.txt +++ b/examples/content.txt @@ -9,3 +9,4 @@ https://www.dropbox.com/s/22bug1he354oqpt/bmw.zip https://www.dropbox.com/s/76gumyy7j0f3cyj/dragon.stl https://www.dropbox.com/s/runlp60bjjf3dpu/bunny_cloud.zip https://www.dropbox.com/s/nim7jsjiumei4f9/boston_teapot_256x256x178_uint8.zip +https://www.dropbox.com/s/yybckz6sawq5nbw/TestNormalMap.png diff --git a/examples/requirements.txt b/examples/requirements.txt index ebf9dd38..317f777d 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -4,6 +4,6 @@ noise numpy pillow scipy -pygame open3d -PySide2 \ No newline at end of file +PySide2 +opencv-python From 3d34ab5d377529e7a0acd4212c7b7563e4230a02 Mon Sep 17 00:00:00 2001 From: n8vm Date: Thu, 15 Apr 2021 16:57:26 -0600 Subject: [PATCH 22/55] fixing race condition with textures and materials --- src/nvisii/nvisii.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 7f28ef7d..81c45a12 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1375,7 +1375,8 @@ void updateComponents() // Manage textures and materials if (Texture::areAnyDirty() || Material::areAnyDirty()) { - std::lock_guard material_lock(Material::areAnyDirty() ? 
*Material::getEditMutex().get() : dummyMutex); + std::lock_guard material_lock(*Material::getEditMutex().get()); + std::lock_guard texture_lock(*Texture::getEditMutex().get()); // Allocate cuda textures for all texture components auto dirtyTextures = Texture::getDirtyTextures(); @@ -1421,7 +1422,6 @@ void updateComponents() colorSpace ); } - } // Create additional cuda textures for material constants @@ -1935,6 +1935,8 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi if ((width < 1) || (height < 1)) throw std::runtime_error("Error, invalid width/height"); std::vector frameBuffer(width * height * 4); + enqueueCommandAndWait([](){}); + enqueueCommandAndWait([&frameBuffer, width, height, samplesPerPixel, seed] () { if (!NVISII.headlessMode) { if ((width != WindowData.currentSize.x) || (height != WindowData.currentSize.y)) From 8c3d050c0d87621ef33d884d3c6551c9d39eb9bd Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 24 Apr 2021 11:41:39 -0600 Subject: [PATCH 23/55] adding a get_center function to entities, which behaves differently to the aabb center --- include/nvisii/entity.h | 6 ++++++ include/nvisii/mesh.h | 4 ++-- src/nvisii/entity.cpp | 7 +++++++ src/nvisii/mesh.cpp | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/nvisii/entity.h b/include/nvisii/entity.h index 9fadab2a..0c53ce24 100644 --- a/include/nvisii/entity.h +++ b/include/nvisii/entity.h @@ -236,6 +236,12 @@ class Entity : public StaticFactory { /** @returns the center of the aligned bounding box. Requires a transform and mesh component to be attached. */ glm::vec3 getAabbCenter(); + /** + * @returns the average of the vertices of the mesh in world space, which will lay roughly in the center. Requires + * a transform and mesh component to be attached. + */ + glm::vec3 getCenter(); + /** For internal use. Returns the mutex used to lock entities for processing by the renderer. 
*/ static std::shared_ptr getEditMutex(); diff --git a/include/nvisii/mesh.h b/include/nvisii/mesh.h index 6bb73cca..ff79da0e 100644 --- a/include/nvisii/mesh.h +++ b/include/nvisii/mesh.h @@ -843,8 +843,8 @@ class Mesh : public StaticFactory // /* TODO: Explain this */ // void save_tetrahedralization(float quality_bound, float maximum_volume); - /** @returns the last computed mesh centroid. */ - glm::vec3 getCentroid(); + /** @returns the average of the vertices of the mesh, which will lay roughly in the center. */ + glm::vec3 getCenter(); /** @returns the minimum axis aligned bounding box position */ glm::vec3 getMinAabbCorner(); diff --git a/src/nvisii/entity.cpp b/src/nvisii/entity.cpp index aac5d55c..aa0f50cf 100644 --- a/src/nvisii/entity.cpp +++ b/src/nvisii/entity.cpp @@ -318,6 +318,13 @@ glm::vec3 Entity::getAabbCenter() return entityStructs[id].bbmin + (entityStructs[id].bbmax - entityStructs[id].bbmin) * .5f; } +glm::vec3 Entity::getCenter() +{ + if (!getTransform()) throw std::runtime_error("Error: no transform attached to entity"); + if (!getMesh()) throw std::runtime_error("Error: no mesh attached to entity"); + return glm::vec3(getTransform()->getLocalToWorldMatrix() * glm::vec4(getMesh()->getCenter(), 1.f)); +} + void Entity::initializeFactory(uint32_t max_components) { if (isFactoryInitialized()) return; diff --git a/src/nvisii/mesh.cpp b/src/nvisii/mesh.cpp index 26fb96dc..bf53228d 100644 --- a/src/nvisii/mesh.cpp +++ b/src/nvisii/mesh.cpp @@ -141,7 +141,7 @@ void Mesh::computeMetadata() this->meshStructs[id].numVerts = uint32_t(positions.size()); } -glm::vec3 Mesh::getCentroid() +glm::vec3 Mesh::getCenter() { return vec3(meshStructs[id].center); } From 819e1b44285cb669a71574c32adf0e73d89108db Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 15 May 2021 23:26:14 -0700 Subject: [PATCH 24/55] changing scene importer to allow for degenerate transforms with a warning --- src/nvisii/nvisii_import_scene.cpp | 15 ++++++++++----- 1 file changed, 10 
insertions(+), 5 deletions(-) diff --git a/src/nvisii/nvisii_import_scene.cpp b/src/nvisii/nvisii_import_scene.cpp index 60bbd3a5..1cebd732 100644 --- a/src/nvisii/nvisii_import_scene.cpp +++ b/src/nvisii/nvisii_import_scene.cpp @@ -42,9 +42,6 @@ std::string dirnameOf(const std::string& fname) Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::quat rotation, std::vector args) { - bool updatesEnabled = areUpdatesEnabled(); - - disableUpdates(); std::string directory = dirnameOf(path); bool verbose = false; bool max_quality = false; @@ -462,7 +459,16 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu } if (verbose) std::cout<< std::string(level, '\t') << "Creating transform " << transformName << std::endl; auto transform = Transform::create(transformName); - transform->setTransform(aiMatrix4x4ToGlm(&node->mTransformation)); + try { + transform->setTransform(aiMatrix4x4ToGlm(&node->mTransformation)); + } catch(...) { + if (verbose) std::cout<< std::string(level, '\t') << "Warning! transform " << transformName << " Decomposition failed! Is the product of the 4x4 with the determinant of the upper left 3x3 nonzero? 
See Graphics Gems II: Decomposing a Matrix into Simple Transformations" << std::endl; + Transform::remove(transformName); + return; + + // transform->setTransform(aiMatrix4x4ToGlm(&node->mTransformation), false); + // transform->setScale({0.f, 0.f, 0.f}); + } if (parentTransform == nullptr) { transform->setScale(transform->getScale() * scale); transform->addRotation(rotation); @@ -514,7 +520,6 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu addNode(scene->mRootNode, nullptr, 0); aiReleaseImport(scene); - if (updatesEnabled) enableUpdates(); if (verbose) std::cout<<"Done!"< Date: Sat, 22 May 2021 13:52:53 -0600 Subject: [PATCH 25/55] adding support for point lights --- src/nvisii/devicecode/path_tracer.cu | 234 +++++---------------------- src/nvisii/nvisii.cpp | 5 +- 2 files changed, 44 insertions(+), 195 deletions(-) diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index c0fbe130..34abd24b 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -244,93 +244,10 @@ void SampleDeltaTracking( } } -// bool debug = (prd.primitiveID == -2); -// if (debug) { -// if (! 
((mn[0] < x[0]) && (x[0] < mx[0]) && -// (mn[1] < x[1]) && (x[1] < mx[1]) && -// (mn[2] < x[2]) && (x[2] < mx[2])) -// ) { -// printf("X"); -// } else { -// printf("O"); -// } -// } -// if (debug) { -// printf("\n"); -// } - OPTIX_CLOSEST_HIT_PROGRAM(VolumeMesh)() { auto &LP = optixLaunchParams; RayPayload &prd = owl::getPRD(); - // const auto &self = owl::getProgramData(); - // LCGRand rng = prd.rng; - - // // Load the volume we hit - // GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); - // uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; - // const auto grid = reinterpret_cast(hdl); - // const auto& tree = grid->tree(); - // auto acc = tree.getAccessor(); - - // auto bbox = acc.root().bbox(); - // auto mx = bbox.max(); - // auto mn = bbox.min(); - // glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + - // (glm::vec3(mx[0], mx[1], mx[2]) - - // glm::vec3(mn[0], mn[1], mn[2])) * .5f; - - // float majorant_extinction = acc.root().valueMax(); - // float gradient_factor = volume.gradient_factor; - // float linear_attenuation_unit = volume.scale; - // float absorption = volume.absorption; - // float scattering = volume.scattering; - - // vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; - // vec3 w = make_vec3(prd.objectSpaceRayDirection); - - // linear_attenuation_unit /= length(w); - - // // Move ray to volume boundary - // float t0 = prd.t0, t1 = prd.t1; - // x = x + t0 * w; - // t1 = t1 - t0; - // t0 = 0.f; - - // // Sample the free path distance to see if our ray makes it to the boundary - // float t; - // int event; - // bool hitVolume = false; - // #define MAX_NULL_COLLISIONS 10000 - // for (int dti = 0; dti < MAX_NULL_COLLISIONS; ++dti) { - // SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, - // absorption, scattering, x, w, t1, t, event); - // x = x + t * w; - - // // The boundary was hit - // if (event == 0) { - // break; - // } - - // // An absorption / emission event occurred - 
// if (event == 1) { - // hitVolume = true; - // break; - // } - - // // A scattering event occurred - // if (event == 2) { - // hitVolume = true; - // break; - // } - - // // A null collision occurred. - // if (event == 3) { - // // update boundary in relation to the new collision x, w does not change. - // t1 = t1 - t; - // } - // } - optixGetObjectToWorldTransformMatrix(prd.localToWorld); // If we don't need motion vectors, (or in the future if an object @@ -353,90 +270,10 @@ OPTIX_CLOSEST_HIT_PROGRAM(VolumeMesh)() OPTIX_CLOSEST_HIT_PROGRAM(VolumeShadowRay)() { - // auto &LP = optixLaunchParams; - // const auto &self = owl::getProgramData(); - // RayPayload &prd = owl::getPRD(); - // LCGRand rng = prd.rng; - - // GET(VolumeStruct volume, VolumeStruct, LP.volumes, self.volumeID); - // uint8_t *hdl = (uint8_t*)LP.volumeHandles.get(self.volumeID, __LINE__).data; - // const auto grid = reinterpret_cast(hdl); - // const auto& tree = grid->tree(); - // auto acc = tree.getAccessor(); - - // auto bbox = acc.root().bbox(); - // auto mx = bbox.max(); - // auto mn = bbox.min(); - // glm::vec3 offset = glm::vec3(mn[0], mn[1], mn[2]) + - // (glm::vec3(mx[0], mx[1], mx[2]) - - // glm::vec3(mn[0], mn[1], mn[2])) * .5f; - - // float majorant_extinction = acc.root().valueMax(); - // float gradient_factor = volume.gradient_factor; - // float linear_attenuation_unit = volume.scale; - // float absorption = volume.absorption; - // float scattering = volume.scattering; - - // vec3 x = make_vec3(prd.objectSpaceRayOrigin) + offset; - // vec3 w = make_vec3(prd.objectSpaceRayDirection); - - // linear_attenuation_unit /= length(w); - - // // Move ray to volume boundary - // float t0 = prd.t0, t1 = prd.t1; - // x = x + t0 * w; - // t1 = t1 - t0; - // t0 = 0.f; - - // // Sample the free path distance to see if our ray makes it to the boundary - // float t; - // int event; - // bool hitVolume = false; - // #define MAX_NULL_COLLISIONS 10000 - // for (int dti = 0; dti < MAX_NULL_COLLISIONS; 
++dti) { - // SampleDeltaTracking(rng, acc, majorant_extinction, linear_attenuation_unit, - // absorption, scattering, x, w, t1, t, event); - // x = x + t * w; - - // // The boundary was hit - // if (event == 0) { - // break; - // } - - // // An absorption / emission event occurred - // if (event == 1) { - // hitVolume = true; - // break; - // } - - // // A scattering event occurred - // if (event == 2) { - // hitVolume = true; - // break; - // } - - // // A null collision occurred. - // if (event == 3) { - // // update boundary in relation to the new collision x, w does not change. - // t1 = t1 - t; - // } - // } - - // if (!hitVolume) { - // prd.tHit = -1.f; - // } - // else { - // prd.instanceID = optixGetInstanceIndex(); - // prd.eventID = event; - // prd.tHit = t; - // } } OPTIX_INTERSECT_PROGRAM(VolumeIntersection)() { - // float old_tmax = optixGetRayTmax(); - - // const int primID = optixGetPrimitiveIndex(); auto &LP = optixLaunchParams; const auto &self = owl::getProgramData(); RayPayload &prd = owl::getPRD(); @@ -1361,47 +1198,58 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() // sample light sources else { + // Sample the light to compute an incident light ray to this point if (numLights == 0) continue; GET( sampledLightID, int, LP.lightEntities, randomID ); GET( EntityStruct light_entity, EntityStruct, LP.entities, sampledLightID ); GET( LightStruct light_light, LightStruct, LP.lights, light_entity.light_id ); GET( TransformStruct transform, TransformStruct, LP.transforms, light_entity.transform_id ); - GET( MeshStruct mesh, MeshStruct, LP.meshes, light_entity.mesh_id ); - uint32_t random_tri_id = uint32_t(min(lcg_randomf(rng) * mesh.numTris, float(mesh.numTris - 1))); - GET( Buffer indices, Buffer, LP.indexLists, light_entity.mesh_id ); - GET( Buffer vertices, Buffer, LP.vertexLists, light_entity.mesh_id ); - GET( Buffer normals, Buffer, LP.normalLists, light_entity.mesh_id ); - GET( Buffer texCoords, Buffer, LP.texCoordLists, light_entity.mesh_id ); - GET( int3 
triIndex, int3, indices, random_tri_id ); - - // Sample the light to compute an incident light ray to this point + auto <w = transform.localToWorld; float3 dir; float2 uv; float3 pos = hit_p; - GET(float3 n1, float3, normals, triIndex.x ); - GET(float3 n2, float3, normals, triIndex.y ); - GET(float3 n3, float3, normals, triIndex.z ); - GET(float3 v1, float3, vertices, triIndex.x ); - GET(float3 v2, float3, vertices, triIndex.y ); - GET(float3 v3, float3, vertices, triIndex.z ); - GET(float2 uv1, float2, texCoords, triIndex.x ); - GET(float2 uv2, float2, texCoords, triIndex.y ); - GET(float2 uv3, float2, texCoords, triIndex.z ); - - // Might be a bug here with normal transform... - n1 = make_float3(ltw * make_float4(n1, 0.0f)); - n2 = make_float3(ltw * make_float4(n2, 0.0f)); - n3 = make_float3(ltw * make_float4(n3, 0.0f)); - v1 = make_float3(ltw * make_float4(v1, 1.0f)); - v2 = make_float3(ltw * make_float4(v2, 1.0f)); - v3 = make_float3(ltw * make_float4(v3, 1.0f)); - sampleTriangle(pos, n1, n2, n3, v1, v2, v3, uv1, uv2, uv3, - lcg_randomf(rng), lcg_randomf(rng), dir, lightDistance, lightPDF, uv, - /*double_sided*/ false, /*use surface area*/ light_light.use_surface_area); + // The sampled light is a point light + if ((light_entity.mesh_id < 0) || (light_entity.mesh_id >= LP.meshes.count)) { + numTris = 1.f; + float3 tmp = make_float3(ltw[3]) - pos; + lightDistance = length(tmp); + dir = tmp / lightDistance; + lightPDF = PdfAtoW(1.f/(4.f * M_PI), lightDistance * lightDistance, 1.f); + uv = make_float2(0.f, 0.f); + } + // The sampled light is a mesh light + else { + GET( MeshStruct mesh, MeshStruct, LP.meshes, light_entity.mesh_id ); + uint32_t random_tri_id = uint32_t(min(lcg_randomf(rng) * mesh.numTris, float(mesh.numTris - 1))); + GET( Buffer indices, Buffer, LP.indexLists, light_entity.mesh_id ); + GET( Buffer vertices, Buffer, LP.vertexLists, light_entity.mesh_id ); + GET( Buffer normals, Buffer, LP.normalLists, light_entity.mesh_id ); + GET( Buffer texCoords, 
Buffer, LP.texCoordLists, light_entity.mesh_id ); + GET( int3 triIndex, int3, indices, random_tri_id ); + GET(float3 n1, float3, normals, triIndex.x ); + GET(float3 n2, float3, normals, triIndex.y ); + GET(float3 n3, float3, normals, triIndex.z ); + GET(float3 v1, float3, vertices, triIndex.x ); + GET(float3 v2, float3, vertices, triIndex.y ); + GET(float3 v3, float3, vertices, triIndex.z ); + GET(float2 uv1, float2, texCoords, triIndex.x ); + GET(float2 uv2, float2, texCoords, triIndex.y ); + GET(float2 uv3, float2, texCoords, triIndex.z ); + // Might be a bug here with normal transform... + n1 = make_float3(ltw * make_float4(n1, 0.0f)); + n2 = make_float3(ltw * make_float4(n2, 0.0f)); + n3 = make_float3(ltw * make_float4(n3, 0.0f)); + v1 = make_float3(ltw * make_float4(v1, 1.0f)); + v2 = make_float3(ltw * make_float4(v2, 1.0f)); + v3 = make_float3(ltw * make_float4(v3, 1.0f)); + sampleTriangle(pos, n1, n2, n3, v1, v2, v3, uv1, uv2, uv3, + lcg_randomf(rng), lcg_randomf(rng), dir, lightDistance, lightPDF, uv, + /*double_sided*/ false, /*use surface area*/ light_light.use_surface_area); + numTris = mesh.numTris; + } falloff = light_light.falloff; - numTris = mesh.numTris; lightDir = make_float3(dir.x, dir.y, dir.z); if (light_light.color_texture_id == -1) lightEmission = make_float3(light_light.r, light_light.g, light_light.b) * (light_light.intensity * pow(2.f, light_light.exposure)); else lightEmission = sampleTexture(light_light.color_texture_id, uv, make_float3(0.f, 0.f, 0.f)) * (light_light.intensity * pow(2.f, light_light.exposure)); diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 81c45a12..315ff35d 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1148,8 +1148,8 @@ void updateComponents() anyUpdated |= Texture::areAnyDirty(); anyUpdated |= Entity::areAnyDirty(); anyUpdated |= Volume::areAnyDirty(); - if (!anyUpdated) return; + resetAccumulation(); std::recursive_mutex dummyMutex; @@ -1360,7 +1360,8 @@ void updateComponents() 
if (!entities[eid].isInitialized()) continue; if (!entities[eid].getTransform()) continue; if (!entities[eid].getLight()) continue; - if (!entities[eid].getMesh()) continue; + // Edit: adding support for "point" lights that have no meshes + // if (!entities[eid].getMesh()) continue; OD.lightEntities.push_back(eid); } owlBufferResize(OptixData.lightEntitiesBuffer, OD.lightEntities.size()); From 2a08bb66daa29d60c4f5eb1acad4a6884fb6867f Mon Sep 17 00:00:00 2001 From: n8vm Date: Sat, 22 May 2021 13:54:13 -0600 Subject: [PATCH 26/55] updating docs --- include/nvisii/light.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/nvisii/light.h b/include/nvisii/light.h index 4b957628..99dd7969 100644 --- a/include/nvisii/light.h +++ b/include/nvisii/light.h @@ -11,7 +11,9 @@ class Texture; /** * A "Light" component illuminates objects in a scene. Light components must * be added to an entity with a transform component to have a visible - * impact on the scene. + * impact on the scene. Lights attached to entities with no mesh components + * act like point lights. Otherwise, lights attached to entities with meshes + * will act like mesh lights. 
*/ class Light : public StaticFactory { friend class StaticFactory; From 7617f80f7fc17a984c310cea0853c732ead618bb Mon Sep 17 00:00:00 2001 From: n8vm Date: Sun, 23 May 2021 11:37:53 -0600 Subject: [PATCH 27/55] improvements to how import_scene handles normal maps --- include/nvisii/mesh.h | 6 ++ include/nvisii/nvisii.h | 1 + include/nvisii/texture.h | 3 + src/nvisii/devicecode/launch_params.h | 3 +- src/nvisii/devicecode/path_tracer.cu | 31 +++++-- src/nvisii/mesh.cpp | 28 ++++++- src/nvisii/nvisii.cpp | 6 ++ src/nvisii/nvisii_import_scene.cpp | 111 +++++++++++++++++++------- src/nvisii/texture.cpp | 5 ++ 9 files changed, 152 insertions(+), 42 deletions(-) diff --git a/include/nvisii/mesh.h b/include/nvisii/mesh.h index ff79da0e..91980cc4 100644 --- a/include/nvisii/mesh.h +++ b/include/nvisii/mesh.h @@ -699,6 +699,8 @@ class Mesh : public StaticFactory * @param position_dimensions The number of floats per position. Valid numbers are 3 or 4. * @param normals A list of vertex normals. If indices aren't supplied, this must be a multiple of 3. * @param normal_dimensions The number of floats per normal. Valid numbers are 3 or 4. + * @param tangents A list of vertex tangents. If indices aren't supplied, this must be a multiple of 3. + * @param tangent_dimensions The number of floats per tangent. Valid numbers are 3 or 4. * @param colors A list of per-vertex colors. If indices aren't supplied, this must be a multiple of 3. * @param color_dimensions The number of floats per color. Valid numbers are 3 or 4. * @param texcoords A list of 2D per-vertex texture coordinates. If indices aren't supplied, this must be a multiple of 3. 
@@ -712,6 +714,8 @@ class Mesh : public StaticFactory uint32_t position_dimensions = 3, std::vector normals = std::vector(), uint32_t normal_dimensions = 3, + std::vector tangents = std::vector(), + uint32_t tangent_dimensions = 3, std::vector colors = std::vector(), uint32_t color_dimensions = 4, std::vector texcoords = std::vector(), @@ -1012,6 +1016,8 @@ class Mesh : public StaticFactory uint32_t position_dimensions, std::vector &normals_, uint32_t normal_dimensions, + std::vector &tangents_, + uint32_t tangent_dimensions, std::vector &colors_, uint32_t color_dimensions, std::vector &texcoords_, diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index dd770fe7..635ad212 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -305,6 +305,7 @@ void renderToFile(uint32_t width, uint32_t height, uint32_t samples_per_pixel, s * "ray_direction" to render the direction that the ray was traced in world space, * "position" for rendering out the world space position of the path vertex, * "normal" for rendering out the world space normal of the path vertex, + * "tangent" for rendering out the world space tangent of the path vertex, * "entity_id" for rendering out the entity ID whose surface the path vertex hit, * "base_color" for rendering out the surface base color, * "texture_coordinates" for rendering out the texture coordinates of the hit surface, diff --git a/include/nvisii/texture.h b/include/nvisii/texture.h index 107cb231..6bb64faa 100644 --- a/include/nvisii/texture.h +++ b/include/nvisii/texture.h @@ -209,6 +209,9 @@ class Texture : public StaticFactory /** @returns True if the texture is represented linearly. Otherwise, the texture is in sRGB space */ bool isLinear(); + /** @param is_linear If True, texels will be interpreted as linear space. 
Otherwise, the texels will be interpreted as sRGB space */ + void setLinear(bool is_linear); + private: /* TODO */ static std::shared_ptr editMutex; diff --git a/src/nvisii/devicecode/launch_params.h b/src/nvisii/devicecode/launch_params.h index e6142f07..5da1324b 100644 --- a/src/nvisii/devicecode/launch_params.h +++ b/src/nvisii/devicecode/launch_params.h @@ -112,7 +112,8 @@ enum RenderDataFlags : uint32_t { RAY_DIRECTION = 18, HEATMAP = 19, TEXTURE_COORDINATES = 20, - DEVICE_ID = 21 + DEVICE_ID = 21, + TANGENT = 22 }; #define MAX_LIGHT_SAMPLES 10 diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index 34abd24b..0e0f154a 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -592,16 +592,19 @@ void initializeRenderData(float3 &renderData) auto &LP = optixLaunchParams; // these might change in the future... if (LP.renderDataMode == RenderDataFlags::NONE) { - renderData = make_float3(FLT_MAX); + renderData = make_float3(0.0f); } else if (LP.renderDataMode == RenderDataFlags::DEPTH) { - renderData = make_float3(FLT_MAX); + renderData = make_float3(-FLT_MAX); } else if (LP.renderDataMode == RenderDataFlags::POSITION) { - renderData = make_float3(FLT_MAX); + renderData = make_float3(-FLT_MAX); } else if (LP.renderDataMode == RenderDataFlags::NORMAL) { - renderData = make_float3(FLT_MAX); + renderData = make_float3(0.0f); + } + else if (LP.renderDataMode == RenderDataFlags::TANGENT) { + renderData = make_float3(0.0f); } else if (LP.renderDataMode == RenderDataFlags::SCREEN_SPACE_NORMAL) { renderData = make_float3(0.0f); @@ -699,7 +702,7 @@ __device__ void saveGeometricRenderData( float3 &renderData, int bounce, float depth, - float3 w_p, float3 w_n, float3 w_o, float2 uv, + float3 w_p, float3 w_n, float3 w_x, float3 w_o, float2 uv, int entity_id, float3 diffuse_mvec, float time, DisneyMaterial &mat) { @@ -716,6 +719,9 @@ void saveGeometricRenderData( else if (LP.renderDataMode == 
RenderDataFlags::NORMAL) { renderData = w_n; } + else if (LP.renderDataMode == RenderDataFlags::TANGENT) { + renderData = w_x; + } else if (LP.renderDataMode == RenderDataFlags::SCREEN_SPACE_NORMAL) { glm::quat r0 = glm::quat_cast(LP.viewT0); glm::quat r1 = glm::quat_cast(LP.viewT1); @@ -945,7 +951,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() v_z = make_float3(normalize(nxfm * make_vec3(v_z))); v_x = make_float3(normalize(nxfm * make_vec3(v_x))); v_y = cross(v_z, v_x); - v_x = cross(v_y, v_z); + // v_x = cross(v_y, v_z); if (LP.renderDataMode != RenderDataFlags::NONE) { glm::mat4 xfmt0 = to_mat4(payload.localToWorldT0); @@ -1028,7 +1034,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } // For segmentations, save geometric metadata - saveGeometricRenderData(renderData, depth, payload.tHit, hit_p, v_z, w_o, uv, entityID, diffuseMotion, time, mat); + saveGeometricRenderData(renderData, depth, payload.tHit, hit_p, v_z, v_x, w_o, uv, entityID, diffuseMotion, time, mat); if (depth == 0) { primaryAlbedo = mat.base_color; primaryNormal = v_z; @@ -1470,7 +1476,18 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() else { // Override framebuffer output if user requested to render metadata accum_illum = make_float3(renderData.x, renderData.y, renderData.z); + if (isnan(renderData.x) || isnan(renderData.y) || isnan(renderData.z) || + isinf(renderData.x) || isinf(renderData.y) || isinf(renderData.z) || + isnan(prev_color.x) || isnan(prev_color.y) || isnan(prev_color.z) || + isinf(prev_color.x) || isinf(prev_color.y) || isinf(prev_color.z)) { + accum_illum = make_float3(0.f, 0.f, 0.f); + prev_color = make_float4(0.f, 0.f, 0.f, 1.f); + } accum_color = make_float4((accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1), 1.0f); + + if (debug) { + printf("output: %f %f %f\n", accum_color.x, accum_color.y, accum_color.z); + } } diff --git a/src/nvisii/mesh.cpp b/src/nvisii/mesh.cpp index bf53228d..83447ad5 100644 --- a/src/nvisii/mesh.cpp +++ b/src/nvisii/mesh.cpp @@ -211,6 +211,8 @@ void 
Mesh::loadData( uint32_t position_dimensions, std::vector &normals_, uint32_t normal_dimensions, + std::vector &tangents_, + uint32_t tangent_dimensions, std::vector &colors_, uint32_t color_dimensions, std::vector &texcoords_, @@ -219,6 +221,7 @@ void Mesh::loadData( ) { bool readingNormals = normals_.size() > 0; + bool readingTangents = tangents_.size() > 0; bool readingColors = colors_.size() > 0; bool readingTexCoords = texcoords_.size() > 0; bool readingIndices = indices_.size() > 0; @@ -228,6 +231,9 @@ void Mesh::loadData( if ((normal_dimensions != 3) && (normal_dimensions != 4)) throw std::runtime_error( std::string("Error, invalid normal dimensions. Possible normal dimensions are 3 or 4.")); + + if ((tangent_dimensions != 3) && (tangent_dimensions != 4)) + throw std::runtime_error( std::string("Error, invalid tangent dimensions. Possible tangent dimensions are 3 or 4.")); if ((color_dimensions != 3) && (color_dimensions != 4)) throw std::runtime_error( std::string("Error, invalid color dimensions. Possible color dimensions are 3 or 4.")); @@ -247,6 +253,9 @@ void Mesh::loadData( if (readingNormals && ((normals_.size() / normal_dimensions) != (positions_.size() / position_dimensions))) throw std::runtime_error( std::string("Error, length mismatch. Total normals: " + std::to_string(normals_.size() / normal_dimensions) + " does not equal total positions: " + std::to_string(positions_.size() / position_dimensions))); + if (readingTangents && ((tangents_.size() / tangent_dimensions) != (positions_.size() / position_dimensions))) + throw std::runtime_error( std::string("Error, length mismatch. Total tangents: " + std::to_string(tangents_.size() / tangent_dimensions) + " does not equal total positions: " + std::to_string(positions_.size() / position_dimensions))); + if (readingColors && ((colors_.size() / color_dimensions) != (positions_.size() / position_dimensions))) throw std::runtime_error( std::string("Error, length mismatch. 
Total colors: " + std::to_string(colors_.size() / color_dimensions) + " does not equal total positions: " + std::to_string(positions_.size() / position_dimensions))); @@ -275,6 +284,12 @@ void Mesh::loadData( vertex.normal.z = normals_[i * normal_dimensions + 2]; vertex.normal.w = (normal_dimensions == 4) ? normals_[i * normal_dimensions + 3] : 0.f; } + if (readingTangents) { + vertex.tangent.x = tangents_[i * tangent_dimensions + 0]; + vertex.tangent.y = tangents_[i * tangent_dimensions + 1]; + vertex.tangent.z = tangents_[i * tangent_dimensions + 2]; + vertex.tangent.w = (tangent_dimensions == 4) ? tangents_[i * tangent_dimensions + 3] : 0.f; + } if (readingColors) { vertex.color.x = colors_[i * color_dimensions + 0]; vertex.color.y = colors_[i * color_dimensions + 1]; @@ -316,6 +331,7 @@ void Mesh::loadData( this->positions.resize(uniqueVertices.size()); this->colors.resize(uniqueVertices.size()); this->normals.resize(uniqueVertices.size()); + this->tangents.resize(uniqueVertices.size()); this->texCoords.resize(uniqueVertices.size()); for (int i = 0; i < uniqueVertices.size(); ++i) { @@ -323,6 +339,7 @@ void Mesh::loadData( this->positions[i] = {v.point.x, v.point.y, v.point.z}; this->colors[i] = v.color; this->normals[i] = v.normal; + this->tangents[i] = v.tangent; this->texCoords[i] = v.texcoord; } @@ -330,6 +347,10 @@ void Mesh::loadData( generateSmoothNormals(); } + if (!readingTangents) { + generateSmoothTangents(); + } + computeMetadata(); } @@ -1339,19 +1360,20 @@ Mesh* Mesh::createFromData( uint32_t position_dimensions, std::vector normals_, uint32_t normal_dimensions, + std::vector tangents_, + uint32_t tangent_dimensions, std::vector colors_, uint32_t color_dimensions, std::vector texcoords_, uint32_t texcoord_dimensions, std::vector indices_ ) { - auto create = [&positions_, position_dimensions, &normals_, normal_dimensions, + auto create = [&positions_, position_dimensions, &normals_, normal_dimensions, &tangents_, tangent_dimensions, &colors_, 
color_dimensions, &texcoords_, texcoord_dimensions, &indices_] (Mesh* mesh) { - mesh->loadData(positions_, position_dimensions, normals_, normal_dimensions, + mesh->loadData(positions_, position_dimensions, normals_, normal_dimensions, tangents_, tangent_dimensions, colors_, color_dimensions, texcoords_, texcoord_dimensions, indices_); - mesh->generateSmoothTangents(); dirtyMeshes.insert(mesh); }; diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 315ff35d..287065fb 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -2062,6 +2062,9 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra else if (option == std::string("normal")) { OptixData.LP.renderDataMode = RenderDataFlags::NORMAL; } + else if (option == std::string("tangent")) { + OptixData.LP.renderDataMode = RenderDataFlags::TANGENT; + } else if (option == std::string("entity_id")) { OptixData.LP.renderDataMode = RenderDataFlags::ENTITY_ID; } @@ -2766,6 +2769,9 @@ void __test__(std::vector args) { else if (option == std::string("normal")) { OptixData.LP.renderDataMode = RenderDataFlags::NORMAL; } + else if (option == std::string("tangent")) { + OptixData.LP.renderDataMode = RenderDataFlags::TANGENT; + } else if (option == std::string("entity_id")) { OptixData.LP.renderDataMode = RenderDataFlags::ENTITY_ID; } diff --git a/src/nvisii/nvisii_import_scene.cpp b/src/nvisii/nvisii_import_scene.cpp index 1cebd732..aed71804 100644 --- a/src/nvisii/nvisii_import_scene.cpp +++ b/src/nvisii/nvisii_import_scene.cpp @@ -255,9 +255,7 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu } } } - - - + // todo, add texture paths to map above, load later and connect if (material->GetTextureCount(aiTextureType_DIFFUSE) > 0) { if (material->GetTexture(aiTextureType_DIFFUSE, 0, &Path, NULL, NULL, NULL, NULL, NULL) == AI_SUCCESS) { @@ -281,6 +279,10 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu std::string path = 
directory + "/" + std::string(Path.C_Str()); std::replace(path.begin(), path.end(), '\\', '/'); if (texture_map[path]) mat->setNormalMapTexture(texture_map[path]); + if (!texture_map[path]->isLinear()) { + if (verbose) std::cout<<"WARNING: normal map texture " << path << " not marked as linear! Forcing texture into linear mode..." << std::endl; + texture_map[path]->setLinear(true); + } } } @@ -323,11 +325,13 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu auto &aiMesh = scene->mMeshes[meshIdx]; auto &aiVertices = aiMesh->mVertices; auto &aiNormals = aiMesh->mNormals; + auto &aiTangents = aiMesh->mTangents; auto &aiFaces = aiMesh->mFaces; auto &aiTextureCoords = aiMesh->mTextureCoords; std::vector positions; std::vector normals; + std::vector tangents; std::vector texCoords; std::vector indices; @@ -347,6 +351,9 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu if (!aiMesh->HasNormals()) { if (verbose) std::cout<<"\tWARNING: mesh " << meshName << " has no normals" << std::endl; } + if (!aiMesh->HasTangentsAndBitangents()) { + if (verbose) std::cout<<"\tWARNING: mesh " << meshName << " has no tangents" << std::endl; + } if (!aiMesh->HasTextureCoords(0)) { if (verbose) std::cout<<"\tWARNING: mesh " << meshName << " has no texture coordinates" << std::endl; } @@ -366,6 +373,12 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu v.normal.y = normal.y; v.normal.z = normal.z; } + if (aiMesh->HasTangentsAndBitangents()) { + auto tangent = aiTangents[vid]; + v.tangent.x = tangent.x; + v.tangent.y = tangent.y; + v.tangent.z = tangent.z; + } if (aiMesh->HasTextureCoords(0)) { // just try to take the first texcoord auto texCoord = aiTextureCoords[0][vid]; @@ -378,6 +391,9 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu normals.push_back(v.normal.x); normals.push_back(v.normal.y); normals.push_back(v.normal.z); + 
tangents.push_back(v.tangent.x); + tangents.push_back(v.tangent.y); + tangents.push_back(v.tangent.z); texCoords.push_back(v.texcoord.x); texCoords.push_back(v.texcoord.y); } @@ -408,6 +424,7 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu meshName, positions, 3, normals, 3, + tangents, 3, /*colors*/{}, 3, texCoords, 2, indices @@ -419,33 +436,6 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu } } - // load lights - for (uint32_t lightIdx = 0; lightIdx < scene->mNumLights; ++lightIdx) { - auto light = scene->mLights[lightIdx]; - if (verbose) { - std::cout<<"Found light: " << std::string(light->mName.C_Str()) << std::endl; - if (light->mType == aiLightSource_DIRECTIONAL) { - std::cout<<"Directional"<mType == aiLightSource_POINT) { - std::cout<<"Point"<mType == aiLightSource_SPOT) { - std::cout<<"Spot"<mType == aiLightSource_AMBIENT) { - std::cout<<"Ambient"<mType == aiLightSource_AREA) { - std::cout<<"Area"<mNumCameras; ++cameraIdx) { - auto camera = scene->mCameras[cameraIdx]; - if (verbose) { - std::cout<<"Found camera: " << std::string(camera->mName.C_Str()) << std::endl; - } - } - std::function addNode; addNode = [&scene, &nvisiiScene, &material_light_map, &addNode, position, rotation, scale, verbose] (aiNode* node, Transform* parentTransform, int level) @@ -516,8 +506,67 @@ Scene importScene(std::string path, glm::vec3 position, glm::vec3 scale, glm::qu for (uint32_t cid = 0; cid < node->mNumChildren; ++cid) addNode(node->mChildren[cid], transform, level+1); }; - addNode(scene->mRootNode, nullptr, 0); + + + // load lights + for (uint32_t lightIdx = 0; lightIdx < scene->mNumLights; ++lightIdx) { + auto &aiLight = scene->mLights[lightIdx]; + std::string lightName = std::string(aiLight->mName.C_Str()); + // if (verbose) + { + if (verbose) std::cout<<"Found light: " << lightName << std::endl; + if (aiLight->mType == aiLightSource_DIRECTIONAL) { + if (verbose) std::cout<<"Directional"<mType == 
aiLightSource_POINT) { + if (verbose) std::cout<<"Point"<setTransform(Transform::get(lightName)); + if (verbose) std::cout<< std::string(1, '\t') << "transform: " << lightName <setLight(light); + light->setColor({aiLight->mColorDiffuse.r, aiLight->mColorDiffuse.g, aiLight->mColorDiffuse.b}); + + nvisiiScene.entities.push_back(entity); + } else if (aiLight->mType == aiLightSource_SPOT) { + std::cout<<"Spot"<mType == aiLightSource_AMBIENT) { + std::cout<<"Ambient"<mType == aiLightSource_AREA) { + std::cout<<"Area"<mNumCameras; ++cameraIdx) { + auto camera = scene->mCameras[cameraIdx]; + if (verbose) { + std::cout<<"Found camera: " << std::string(camera->mName.C_Str()) << std::endl; + } + } + aiReleaseImport(scene); if (verbose) std::cout<<"Done!"< Date: Sun, 23 May 2021 11:39:29 -0600 Subject: [PATCH 28/55] removing debug print in device code --- src/nvisii/devicecode/path_tracer.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index 0e0f154a..4b5c896a 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -1485,9 +1485,9 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } accum_color = make_float4((accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1), 1.0f); - if (debug) { - printf("output: %f %f %f\n", accum_color.x, accum_color.y, accum_color.z); - } + // if (debug) { + // printf("output: %f %f %f\n", accum_color.x, accum_color.y, accum_color.z); + // } } From ef1880aa118cb46f180a4791ee70e4471d0de644 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 22 Sep 2021 14:53:10 -0600 Subject: [PATCH 29/55] adding support for alpha transparent backgrounds --- include/nvisii/nvisii.h | 22 +++---- src/externals/glfw_implementation/glfw.cpp | 1 + src/nvisii/devicecode/launch_params.h | 2 +- src/nvisii/devicecode/path_tracer.cu | 52 ++++++++++------ src/nvisii/nvisii.cpp | 72 +++++----------------- 5 files changed, 59 
insertions(+), 90 deletions(-) diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index 635ad212..3a58f2ed 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -124,9 +124,10 @@ void setDomeLightExposure(float exposure); /** * Sets the color which this dome light will emit. * - * @param The RGB color emitted that this dome light should emit. + * @param color The RGB color emitted that this dome light should emit. + * @param alpha The alpha transparency to use for the background if hit by primary rays. */ -void setDomeLightColor(glm::vec3 color); +void setDomeLightColor(glm::vec3 color, float alpha = 1.0f); /** * Configures the procedural sky for the dome light (aka the environment). @@ -137,18 +138,21 @@ void setDomeLightColor(glm::vec3 color); * @param atmosphere_thickness effects Rayleigh scattering. Thin atmospheres look more * like space, and thick atmospheres see more Rayleigh scattering. * @param saturation causes the sky to appear more or less "vibrant" + * @param alpha The alpha transparency to use for the background if hit by primary rays. */ void setDomeLightSky( glm::vec3 sun_position, glm::vec3 sky_tint = vec3(.5f, .5f, .5f), float atmosphere_thickness = 1.0f, - float saturation = 1.0f); + float saturation = 1.0f, + float alpha = 1.0f); /** * Sets the texture used to color the dome light (aka the environment). * Textures are sampled using a 2D to 3D latitude/longitude strategy. * - * @param texture The texture to sample for the dome light. + * @param texture The texture to sample for the dome light. Alpha channel values + * effect alpha transparency of background for primary rays. * @param enable_cdf If True, reduces noise of sampling a dome light texture, * but at the expense of frame rate. Useful for dome lights with bright lights * that should cast shadows. 
@@ -269,16 +273,6 @@ void configureDenoiser(bool use_albedo_guide = true, bool use_normal_guide = tru */ std::vector render(uint32_t width, uint32_t height, uint32_t samples_per_pixel, uint32_t seed = 0); -/** - * Deprecated. Please use renderToFile. -*/ -void renderToHDR(uint32_t width, uint32_t height, uint32_t samples_per_pixel, std::string image_path, uint32_t seed = 0); - -/** - * Deprecated. Please use renderToFile. -*/ -void renderToPNG(uint32_t width, uint32_t height, uint32_t samples_per_pixel, std::string image_path, uint32_t seed = 0); - /** * Renders the current scene, saving the resulting framebuffer to an image on disk. * diff --git a/src/externals/glfw_implementation/glfw.cpp b/src/externals/glfw_implementation/glfw.cpp index 2753c195..d2564e9f 100644 --- a/src/externals/glfw_implementation/glfw.cpp +++ b/src/externals/glfw_implementation/glfw.cpp @@ -103,6 +103,7 @@ namespace Libraries { glfwWindowHint(GLFW_DECORATED, (decorated) ? GLFW_TRUE : GLFW_FALSE); glfwWindowHint(GLFW_RESIZABLE, (resizable) ? GLFW_TRUE : GLFW_FALSE); glfwWindowHint(GLFW_FLOATING, (floating) ? 
GLFW_TRUE : GLFW_FALSE); + glfwWindowHint(GLFW_TRANSPARENT_FRAMEBUFFER, GLFW_TRUE); // glfwWindowHint( GLFW_DOUBLEBUFFER, GL_FALSE ); Window window = {}; diff --git a/src/nvisii/devicecode/launch_params.h b/src/nvisii/devicecode/launch_params.h index 5da1324b..4587d379 100644 --- a/src/nvisii/devicecode/launch_params.h +++ b/src/nvisii/devicecode/launch_params.h @@ -32,7 +32,7 @@ struct LaunchParams { OptixTraversableHandle IAS; float domeLightIntensity = 1.f; float domeLightExposure = 0.f; - glm::vec3 domeLightColor = glm::vec3(-1.f); + glm::vec4 domeLightColor = glm::vec4(-1.f); float directClamp = 100.f; float indirectClamp = 100.f; uint32_t maxDiffuseDepth = 2; diff --git a/src/nvisii/devicecode/path_tracer.cu b/src/nvisii/devicecode/path_tracer.cu index 4b5c896a..a8837429 100644 --- a/src/nvisii/devicecode/path_tracer.cu +++ b/src/nvisii/devicecode/path_tracer.cu @@ -89,25 +89,27 @@ cudaTextureObject_t getEnvironmentTexture() } inline __device__ -float3 missColor(const float3 n_dir, cudaTextureObject_t &tex) +float4 missColor(const float3 n_dir, cudaTextureObject_t &tex) { auto &LP = optixLaunchParams; vec3 rayDir = LP.environmentMapRotation * make_vec3(n_dir); if (tex) { vec2 tc = toUV(vec3(rayDir.x, rayDir.y, rayDir.z)); - float4 texColor = tex2D(tex, tc.x,tc.y); - return make_float3(texColor); + return tex2D(tex, tc.x,tc.y); } - if (glm::any(glm::greaterThanEqual(LP.domeLightColor, glm::vec3(0.f)))) return make_float3(LP.domeLightColor); + // If none of the background color channels are negative, return that dome light color. + if (glm::any(glm::greaterThanEqual(LP.domeLightColor, glm::vec4(0.f)))) return make_float4(LP.domeLightColor); + + // otherwise, we found a negative value, so revert to some default interpolated background value. 
float t = 0.5f*(rayDir.z + 1.0f); float3 c = (1.0f - t) * make_float3(pow(vec3(1.0f), vec3(2.2f))) + t * make_float3( pow(vec3(0.5f, 0.7f, 1.0f), vec3(2.2f)) ); - return c; + return make_float4(c, 1.f); } inline __device__ -float3 missColor(const owl::Ray &ray, cudaTextureObject_t &tex) +float4 missColor(const owl::Ray &ray, cudaTextureObject_t &tex) { return missColor(ray.direction, tex); } @@ -843,6 +845,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() float3 renderData = make_float3(0.f); float3 primaryAlbedo = make_float3(0.f); float3 primaryNormal = make_float3(0.f); + float primaryAlpha = 1.f; initializeRenderData(renderData); uint8_t depth = 0; @@ -875,13 +878,20 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() if (payload.tHit <= 0.f) { // Compute lighting from environment if (depth == 0) { - float3 col = missColor(ray, envTex); - illum = illum + pathThroughput * (col * LP.domeLightIntensity); + // don't account for exposure if this is a primary ray. + // (note, this is not physically correct... however, it makes directly visible lights more artist directable. ) + // (todo, account for lights visible through specular chains...) + // Also store primary albedo for denoising + float4 col = missColor(ray, envTex); + illum = illum + pathThroughput * (make_float3(col) * LP.domeLightIntensity); directIllum = illum; - primaryAlbedo = col; + primaryAlbedo = make_float3(col); + primaryAlpha = col.w; + } + else if (enableDomeSampling){ + // else account for exposure of the dome light in addition to intensity. 
+ illum = illum + pathThroughput * (make_float3(missColor(ray, envTex)) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); } - else if (enableDomeSampling) - illum = illum + pathThroughput * (missColor(ray, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); const float envDist = 10000.0f; // large value /* Compute miss motion vector */ @@ -1199,7 +1209,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() } numTris = 1.f; - lightEmission = (missColor(lightDir, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); + lightEmission = (make_float3(missColor(lightDir, envTex)) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure)); } // sample light sources else @@ -1340,7 +1350,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() if ((payload.instanceID == -1) && (sampledLightID == -1) && enableDomeSampling) { // Case where we hit the background, and also previously sampled the background float w = power_heuristic(1.f, bsdfPDF, 1.f, lightPDF); - float3 lightEmission = missColor(ray, envTex) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure); + float3 lightEmission = make_float3(missColor(ray, envTex)) * LP.domeLightIntensity * pow(2.f, LP.domeLightExposure); float3 Li = (lightEmission * w) / bsdfPDF; if (dotNWi > 0.f) { @@ -1471,7 +1481,13 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() if (LP.renderDataMode == RenderDataFlags::NONE) { - accum_color = make_float4((accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1), 1.0f); + // accum_color = (make_float4(accum_illum, primaryAlpha) + float(LP.frameID) * prev_color) / float(LP.frameID + 1); + float3 c = (accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1); + float a = (primaryAlpha + float(LP.frameID) * prev_color.w) / float(LP.frameID + 1); + accum_color.x = c.x; + accum_color.y = c.y; + accum_color.z = c.z; + accum_color.w = a; } else { // Override framebuffer output if user requested to render metadata @@ -1483,11 +1499,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() 
accum_illum = make_float3(0.f, 0.f, 0.f); prev_color = make_float4(0.f, 0.f, 0.f, 1.f); } - accum_color = make_float4((accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1), 1.0f); - - // if (debug) { - // printf("output: %f %f %f\n", accum_color.x, accum_color.y, accum_color.z); - // } + accum_color = make_float4( (accum_illum + float(LP.frameID) * make_float3(prev_color)) / float(LP.frameID + 1), 1.0f); } @@ -1496,7 +1508,7 @@ OPTIX_RAYGEN_PROGRAM(rayGen)() vec4 oldNormal = make_vec4(prev_normal); if (any(isnan(oldAlbedo))) oldAlbedo = vec4(0.f); if (any(isnan(oldNormal))) oldNormal = vec4(0.f); - vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, 1.f); + vec4 newAlbedo = vec4(primaryAlbedo.x, primaryAlbedo.y, primaryAlbedo.z, primaryAlpha); vec4 accumAlbedo = (newAlbedo + float(LP.frameID) * oldAlbedo) / float(LP.frameID + 1); vec4 newNormal = vec4(make_vec3(primaryNormal), 1.f); if (!all(equal(make_vec3(primaryNormal), vec3(0.f, 0.f, 0.f)))) { diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 94c8ed14..2b956277 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -519,7 +519,7 @@ void initializeOptix(bool headless) { "instanceToEntity", OWL_BUFFER, OWL_OFFSETOF(LaunchParams, instanceToEntity)}, { "domeLightIntensity", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, domeLightIntensity)}, { "domeLightExposure", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, domeLightExposure)}, - { "domeLightColor", OWL_USER_TYPE(glm::vec3), OWL_OFFSETOF(LaunchParams, domeLightColor)}, + { "domeLightColor", OWL_USER_TYPE(glm::vec4), OWL_OFFSETOF(LaunchParams, domeLightColor)}, { "directClamp", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, directClamp)}, { "indirectClamp", OWL_USER_TYPE(float), OWL_OFFSETOF(LaunchParams, indirectClamp)}, { "maxDiffuseDepth", OWL_USER_TYPE(uint32_t), OWL_OFFSETOF(LaunchParams, maxDiffuseDepth)}, @@ -892,13 +892,15 @@ void setDomeLightExposure(float exposure) 
resetAccumulation(); } -void setDomeLightColor(vec3 color) +void setDomeLightColor(vec3 color, float alpha) { clearDomeLightTexture(); - color.r = glm::max(0.f, glm::min(color.r, 1.f)); - color.g = glm::max(0.f, glm::min(color.g, 1.f)); - color.b = glm::max(0.f, glm::min(color.b, 1.f)); - OptixData.LP.domeLightColor = color; + vec4 c; + c.r = glm::max(0.f, glm::min(color.r, 1.f)); + c.g = glm::max(0.f, glm::min(color.g, 1.f)); + c.b = glm::max(0.f, glm::min(color.b, 1.f)); + c.a = glm::max(0.f, glm::min(alpha, 1.f)); + OptixData.LP.domeLightColor = c; resetAccumulation(); } @@ -938,9 +940,9 @@ vec3 toPolar(vec2 uv) return n; } -void setDomeLightSky(vec3 sunPos, vec3 skyTint, float atmosphereThickness, float saturation) +void setDomeLightSky(vec3 sunPos, vec3 skyTint, float atmosphereThickness, float saturation, float alpha) { - enqueueCommand([sunPos, skyTint, atmosphereThickness, saturation] () { + enqueueCommand([sunPos, skyTint, atmosphereThickness, saturation, alpha] () { /* Generate procedural sky */ uint32_t width = 1024/2; uint32_t height = 512/2; @@ -950,7 +952,7 @@ void setDomeLightSky(vec3 sunPos, vec3 skyTint, float atmosphereThickness, float glm::vec2 uv = glm::vec2(x / float(width), y / float(height)); glm::vec3 dir = toPolar(uv); glm::vec3 c = ProceduralSkybox(glm::vec3(dir.x, -dir.z, dir.y), glm::vec3(sunPos.x, sunPos.z, sunPos.y), skyTint, atmosphereThickness, saturation); - texels[x + y * width] = glm::vec4(c.r, c.g, c.b, 1.0f); + texels[x + y * width] = glm::vec4(c.r, c.g, c.b, alpha); } } @@ -1942,7 +1944,7 @@ std::vector render(uint32_t width, uint32_t height, uint32_t samplesPerPi auto glfw = Libraries::GLFW::Get(); glfw->poll_events(); glfw->swap_buffers("NVISII"); - glClearColor(1,1,1,1); + glClearColor(0,0,0,0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); } @@ -2111,7 +2113,7 @@ std::vector renderData(uint32_t width, uint32_t height, uint32_t startFra auto glfw = Libraries::GLFW::Get(); glfw->poll_events(); 
glfw->swap_buffers("NVISII"); - glClearColor(1,1,1,1); + glClearColor(0,0,0,0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); } @@ -2207,19 +2209,6 @@ void renderDataToFile(uint32_t width, uint32_t height, uint32_t startFrame, uint } } -static bool renderToHDRDeprecatedShown = false; -void renderToHDR(uint32_t width, uint32_t height, uint32_t samplesPerPixel, std::string imagePath, uint32_t seed) -{ - if (renderToHDRDeprecatedShown == false) { - std::cout<<"Warning, render_to_hdr is deprecated and will be removed in a subsequent release. Please switch to render_to_file." << std::endl; - renderToHDRDeprecatedShown = true; - } - - std::vector fb = render(width, height, samplesPerPixel, seed); - stbi_flip_vertically_on_write(true); - stbi_write_hdr(imagePath.c_str(), width, height, /* num channels*/ 4, fb.data()); -} - float linearToSRGB(float x) { if (x <= 0.0031308f) { return 12.92f * x; @@ -2248,36 +2237,6 @@ vec3 Uncharted2Tonemap(vec3 x) return max(vec3(0.0f), ((x*(A*x+C*B)+D*E_)/(x*(A*x+B)+D*F))-E_/F); } -static bool renderToPNGDeprecatedShown = false; -void renderToPNG(uint32_t width, uint32_t height, uint32_t samplesPerPixel, std::string imagePath, uint32_t seed) -{ - if (renderToPNGDeprecatedShown == false) { - std::cout<<"Warning, render_to_png is deprecated and will be removed in a subsequent release. Please switch to render_to_file." 
<< std::endl; - renderToPNGDeprecatedShown = true; - } - - // float exposure = 2.f; // TODO: expose as a parameter - - std::vector fb = render(width, height, samplesPerPixel, seed); - std::vector colors(4 * width * height); - for (size_t i = 0; i < (width * height); ++i) { - vec3 color = vec3(fb[i * 4 + 0], fb[i * 4 + 1], fb[i * 4 + 2]); - float alpha = fb[i * 4 + 3]; - - // color = Uncharted2Tonemap(color * exposure); - // color = color * (1.0f / Uncharted2Tonemap(vec3(11.2f))); - - color = glm::convertLinearToSRGB(color); - - colors[i * 4 + 0] = uint8_t(glm::clamp(color.r * 255.f, 0.f, 255.f)); - colors[i * 4 + 1] = uint8_t(glm::clamp(color.g * 255.f, 0.f, 255.f)); - colors[i * 4 + 2] = uint8_t(glm::clamp(color.b * 255.f, 0.f, 255.f)); - colors[i * 4 + 3] = uint8_t(glm::clamp(alpha * 255.f, 0.f, 255.f)); - } - stbi_flip_vertically_on_write(true); - stbi_write_png(imagePath.c_str(), width, height, /* num channels*/ 4, colors.data(), /* stride in bytes */ width * 4); -} - void renderToFile(uint32_t width, uint32_t height, uint32_t samplesPerPixel, std::string imagePath, uint32_t seed) { std::vector fb = render(width, height, samplesPerPixel, seed); @@ -2394,12 +2353,15 @@ void initializeInteractive( int numGPUs = owlGetDeviceCount(OptixData.context); + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + while (!stopped) { /* Poll events from the window */ glfw->poll_events(); glfw->swap_buffers("NVISII"); - glClearColor(1,1,1,1); + glClearColor(0,0,0,0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); if (NVISII.callback && NVISII.callbackMutex.try_lock()) { From 52b4655105924dd5280162f83c813b605243fae8 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 27 Oct 2021 15:33:32 -0600 Subject: [PATCH 30/55] removing lazy_updates from initialize. Removing deprecated initialize_headless and initialize_interactive methods. 
Improving functionality of enable_updates and disable_updates --- ...15.camera_motion_car_blur.py => 15.car.py} | 28 +++------ include/nvisii/nvisii.h | 32 ---------- src/nvisii/nvisii.cpp | 63 +++++++++---------- 3 files changed, 39 insertions(+), 84 deletions(-) rename examples/{15.camera_motion_car_blur.py => 15.car.py} (90%) diff --git a/examples/15.camera_motion_car_blur.py b/examples/15.car.py similarity index 90% rename from examples/15.camera_motion_car_blur.py rename to examples/15.car.py index 4afab621..0f3a7746 100644 --- a/examples/15.camera_motion_car_blur.py +++ b/examples/15.car.py @@ -21,13 +21,9 @@ dome = nvisii.texture.create_from_file("dome", "content/teatro_massimo_2k.hdr") # we can add HDR images to act as dome -nvisii.set_dome_light_texture(dome) +nvisii.set_dome_light_texture(dome, enable_cdf=True) nvisii.set_dome_light_rotation(nvisii.angleAxis(nvisii.pi() * .5, nvisii.vec3(0, 0, 1))) -car_speed = 0 -car_speed_x = car_speed -car_speed_y = -2 * car_speed - camera_height = 80 # # # # # # # # # # # # # # # # # # # # # # # # # @@ -46,20 +42,9 @@ camera.get_transform().look_at( at = nvisii.vec3(-50,0,camera_height) , # look at (world coordinate) up = nvisii.vec3(0,0,1), # up vector - eye = nvisii.vec3(-500,500,100 + camera_height), - previous = False -) - -camera.get_transform().look_at( - at = nvisii.vec3(-50,0,camera_height) + nvisii.vec3(car_speed_x, car_speed_y, .0) , # look at (world coordinate) - up = nvisii.vec3(0,0,1), # up vector - eye = nvisii.vec3(-500,500,100 + camera_height), - previous = True + eye = nvisii.vec3(-500,500,100 + camera_height) ) -camera.get_camera().set_aperture_diameter(5000) -camera.get_camera().set_focal_distance(500) - nvisii.set_camera_entity(camera) floor = nvisii.entity.create( @@ -71,7 +56,7 @@ floor.get_transform().set_scale(nvisii.vec3(10000)) floor.get_transform().set_position(nvisii.vec3(0, 0, -5)) floor.get_material().set_base_color(nvisii.vec3(1.0)) -floor.get_material().set_roughness(1) 
+floor.get_material().set_roughness(0) floor.get_material().set_specular(0) # # # # # # # # # # # # # # # # # # # # # # # # # @@ -102,8 +87,6 @@ # print(s.get_name()) # if 'car' in s.get_name(): # print(s.get_name()) - s.get_transform().set_linear_velocity(nvisii.vec3(car_speed_x, car_speed_y, .0)) - print(s.get_name()) if "carshell" in s.get_name().lower(): s.get_material().set_clearcoat(1) @@ -137,6 +120,11 @@ s.get_material().set_transmission(1) s.get_material().set_roughness(0) s.get_material().set_metallic(0) + s.set_visibility(shadow = False) + + if "interior" in s.get_name().lower(): + s.get_material().set_base_color((1,1,1)) + # elif 'light' in s.get_name().lower(): # print(s.get_name()) # s.set_light(nvisii.light.create('light' + str(i_s))) diff --git a/include/nvisii/nvisii.h b/include/nvisii/nvisii.h index 3a58f2ed..54e1caf5 100644 --- a/include/nvisii/nvisii.h +++ b/include/nvisii/nvisii.h @@ -10,42 +10,11 @@ namespace nvisii { -/** - * Deprecated. Please use initialize() instead. -*/ -void initializeInteractive( - bool window_on_top = false, - bool verbose = false, - uint32_t max_entities = 10000, - uint32_t max_cameras = 10, - uint32_t max_transforms = 10000, - uint32_t max_meshes = 10000, - uint32_t max_materials = 10000, - uint32_t max_lights = 100, - uint32_t max_textures = 1000, - uint32_t max_volumes = 1000); - -/** - * Deprecated. Please use initialize(headless = True) instead. -*/ -void initializeHeadless( - bool verbose = false, - uint32_t max_entities = 10000, - uint32_t max_cameras = 10, - uint32_t max_transforms = 10000, - uint32_t max_meshes = 10000, - uint32_t max_materials = 10000, - uint32_t max_lights = 100, - uint32_t max_textures = 1000, - uint32_t max_volumes = 1000); - /** * Initializes various backend systems required to render scene data. * * @param headless If true, avoids using any OpenGL resources, to enable use on systems without displays. 
* @param window_on_top Keeps the window opened during an interactive session on top of any other windows. (assuming headless is False) - * @param lazy_updates If True, nvisii will only upload components to the GPU on call to - * render/render_to_png/render_data for better scene editing performance. (assuming headless is False. Always on when headless is True) * @param verbose If false, nvisii will avoid outputing any unneccessary text * @param max_entities The max number of creatable Entity components. * @param max_cameras The max number of creatable Camera components. @@ -58,7 +27,6 @@ void initializeHeadless( void initialize( bool headless = false, bool window_on_top = false, - bool lazy_updates = false, bool verbose = false, uint32_t max_entities = 10000, uint32_t max_cameras = 10, diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 2b956277..2abfc2a1 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -61,7 +61,7 @@ std::promise exitSignal; std::thread renderThread; static bool initialized = false; static bool stopped = true; -static bool lazyUpdatesEnabled = false; +static bool paused = false; static bool verbose = true; static struct WindowData { @@ -2360,20 +2360,21 @@ void initializeInteractive( { /* Poll events from the window */ glfw->poll_events(); - glfw->swap_buffers("NVISII"); - glClearColor(0,0,0,0); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + if (!paused) { + glfw->swap_buffers("NVISII"); + glClearColor(0,0,0,0); + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); - if (NVISII.callback && NVISII.callbackMutex.try_lock()) { - NVISII.callback(); - NVISII.callbackMutex.unlock(); - } + if (NVISII.callback && NVISII.callbackMutex.try_lock()) { + NVISII.callback(); + NVISII.callbackMutex.unlock(); + } - static double start=0; - static double stop=0; - start = glfwGetTime(); + static double start=0; + static double stop=0; + start = glfwGetTime(); - if (!lazyUpdatesEnabled) { updateFrameBuffer(); 
updateComponents(); updateLaunchParams(); @@ -2394,24 +2395,25 @@ void initializeInteractive( if (OptixData.enableDenoiser) { denoiseImage(); } + // glm::vec4* samplePtr = (glm::vec4*) owlBufferGetPointer(OptixData.accumBuffer,0); + // glm::vec4* mvecPtr = (glm::vec4*) owlBufferGetPointer(OptixData.mvecBuffer,0); + // glm::vec4* t0AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.scratchBuffer,0); + // glm::vec4* t1AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.albedoBuffer,0); + // glm::vec4* fbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); + // glm::vec4* sPtr = (glm::vec4*) owlBufferGetPointer(OptixData.normalBuffer,0); + // int width = OptixData.LP.frameSize.x; + // int height = OptixData.LP.frameSize.y; + // reproject(samplePtr, t0AlbPtr, t1AlbPtr, mvecPtr, sPtr, fbPtr, width, height); + + drawFrameBufferToWindow(); + stop = glfwGetTime(); + glfwSetWindowTitle(WindowData.window, std::to_string(1.f / (stop - start)).c_str()); + drawGUI(); } - // glm::vec4* samplePtr = (glm::vec4*) owlBufferGetPointer(OptixData.accumBuffer,0); - // glm::vec4* mvecPtr = (glm::vec4*) owlBufferGetPointer(OptixData.mvecBuffer,0); - // glm::vec4* t0AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.scratchBuffer,0); - // glm::vec4* t1AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.albedoBuffer,0); - // glm::vec4* fbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); - // glm::vec4* sPtr = (glm::vec4*) owlBufferGetPointer(OptixData.normalBuffer,0); - // int width = OptixData.LP.frameSize.x; - // int height = OptixData.LP.frameSize.y; - // reproject(samplePtr, t0AlbPtr, t1AlbPtr, mvecPtr, sPtr, fbPtr, width, height); - - drawFrameBufferToWindow(); - stop = glfwGetTime(); - glfwSetWindowTitle(WindowData.window, std::to_string(1.f / (stop - start)).c_str()); - drawGUI(); processCommandQueue(); checkForErrors(); + if (stopped) break; } @@ -2496,7 +2498,6 @@ void initializeHeadless( void initialize( bool headless, bool windowOnTop, - bool 
_lazyUpdatesEnabled, bool verbose, uint32_t maxEntities, uint32_t maxCameras, @@ -2507,12 +2508,10 @@ void initialize( uint32_t maxTextures, uint32_t maxVolumes) { - lazyUpdatesEnabled = _lazyUpdatesEnabled; // prevents deprecated warning from showing initializeInteractiveDeprecatedShown = true; initializeHeadlessDeprecatedShown = true; - lazyUpdatesEnabled = _lazyUpdatesEnabled; if (headless) initializeHeadless( verbose, maxEntities, maxCameras, maxTransforms, maxMeshes, @@ -2584,17 +2583,17 @@ void updateSceneAabb(Entity* entity) void enableUpdates() { - enqueueCommandAndWait([] () { lazyUpdatesEnabled = false; }); + enqueueCommandAndWait([] () { paused = false; }); } void disableUpdates() { - enqueueCommandAndWait([] () { lazyUpdatesEnabled = true; }); + enqueueCommandAndWait([] () { paused = true; }); } bool areUpdatesEnabled() { - return lazyUpdatesEnabled == false; + return paused == false; } #ifdef __unix__ From f1762f389aa623a0134fc8822ec286a3e6f76f64 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 27 Oct 2021 15:50:14 -0600 Subject: [PATCH 31/55] improving the performance of transform updates --- include/nvisii/transform.h | 9 ----- src/nvisii/transform.cpp | 67 +++++++------------------------------- 2 files changed, 12 insertions(+), 64 deletions(-) diff --git a/include/nvisii/transform.h b/include/nvisii/transform.h index cdf3640f..65559326 100644 --- a/include/nvisii/transform.h +++ b/include/nvisii/transform.h @@ -78,15 +78,6 @@ class Transform : public StaticFactory static std::vector transformStructs; static std::map lookupTable; - /* Updates cached rotation values */ - void updateRotation(); - - /* Updates cached position values */ - void updatePosition(); - - /* Updates cached scale values */ - void updateScale(); - /* Updates cached final local to parent matrix values */ void updateMatrix(); diff --git a/src/nvisii/transform.cpp b/src/nvisii/transform.cpp index 96950ce5..754eda1c 100644 --- a/src/nvisii/transform.cpp +++ b/src/nvisii/transform.cpp 
@@ -423,7 +423,7 @@ void Transform::setRotation(quat newRotation, bool previous) if (previous) useRelativeAngularMotionBlur = false; auto &r = (previous) ? prevRotation : rotation; r = glm::normalize(newRotation); - updateRotation(); + updateMatrix(); markDirty(); } @@ -436,7 +436,7 @@ void Transform::addRotation(quat additionalRotation, bool previous) { if (previous) useRelativeAngularMotionBlur = false; setRotation(getRotation(previous) * additionalRotation, previous); - updateRotation(); + updateMatrix(); markDirty(); } @@ -445,21 +445,6 @@ void Transform::addAngleAxis(float angle, vec3 axis, bool previous) addRotation(glm::angleAxis(angle, axis), previous); } -void Transform::updateRotation() -{ - // localToParentRotation = glm::toMat4(rotation); - // parentToLocalRotation = glm::inverse(localToParentRotation); - - // if (useRelativeMotionBlur) { - // prevLocalToParentRotation = glm::toMat4(angularVelocity * rotation); - // } else { - // prevLocalToParentRotation = glm::toMat4(prevRotation); - // } - // prevParentToLocalRotation = glm::inverse(prevLocalToParentRotation); - updateMatrix(); - markDirty(); -} - vec3 Transform::getPosition(bool previous) { if (previous) return prevPosition; @@ -513,7 +498,7 @@ void Transform::setPosition(vec3 newPosition, bool previous) if (previous) useRelativeLinearMotionBlur = false; auto &p = (previous) ? 
prevPosition : position; p = newPosition; - updatePosition(); + updateMatrix(); markDirty(); } @@ -521,7 +506,7 @@ void Transform::addPosition(vec3 additionalPosition, bool previous) { if (previous) useRelativeLinearMotionBlur = false; setPosition(getPosition(previous) + additionalPosition, previous); - updatePosition(); + updateMatrix(); markDirty(); } @@ -531,7 +516,7 @@ void Transform::setLinearVelocity(vec3 newLinearVelocity, float framesPerSecond, mix = glm::clamp(mix, 0.f, 1.f); newLinearVelocity /= framesPerSecond; linearMotion = glm::mix(newLinearVelocity, linearMotion, mix); - updatePosition(); + updateMatrix(); markDirty(); } @@ -543,7 +528,7 @@ void Transform::setAngularVelocity(quat newAngularVelocity, float framesPerSecon newAngularVelocity[1] = newAngularVelocity[1] / framesPerSecond; newAngularVelocity[2] = newAngularVelocity[2] / framesPerSecond; angularMotion = glm::lerp(newAngularVelocity, angularMotion, mix); - updateRotation(); + updateMatrix(); markDirty(); } @@ -553,7 +538,7 @@ void Transform::setScalarVelocity(vec3 newScalarVelocity, float framesPerSecond, mix = glm::clamp(mix, 0.f, 1.f); newScalarVelocity /= framesPerSecond; scalarMotion = glm::mix(newScalarVelocity, scalarMotion, mix); - updateScale(); + updateMatrix(); markDirty(); } @@ -581,21 +566,6 @@ void Transform::clearMotion() // markDirty(); // } -void Transform::updatePosition() -{ - // localToParentTranslation = glm::translate(glm::mat4(1.0), position); - // parentToLocalTranslation = glm::translate(glm::mat4(1.0), -position); - // if (useRelativeMotionBlur) { - // prevLocalToParentTranslation = glm::translate(glm::mat4(1.0), position + linearVelocity); - // prevParentToLocalTranslation = glm::translate(glm::mat4(1.0), -position + linearVelocity); - // } else { - // prevLocalToParentTranslation = glm::translate(glm::mat4(1.0), prevPosition); - // prevParentToLocalTranslation = glm::translate(glm::mat4(1.0), -prevPosition); - // } - updateMatrix(); - markDirty(); -} - vec3 
Transform::getScale(bool previous) { if (previous) return prevScale; @@ -607,14 +577,14 @@ void Transform::setScale(vec3 newScale, bool previous) if (previous) useRelativeScalarMotionBlur = false; auto &s = (previous) ? prevScale : scale; s = newScale; - updateScale(); + updateMatrix(); markDirty(); } // void Transform::setScale(float newScale) // { // scale = vec3(newScale, newScale, newScale); -// updateScale(); +// updateMatrix(); // markDirty(); // } @@ -622,7 +592,7 @@ void Transform::addScale(vec3 additionalScale, bool previous) { if (previous) useRelativeScalarMotionBlur = false; setScale(getScale(previous) + additionalScale, previous); - updateScale(); + updateMatrix(); markDirty(); } @@ -644,21 +614,6 @@ void Transform::addScale(vec3 additionalScale, bool previous) // markDirty(); // } -void Transform::updateScale() -{ - // localToParentScale = glm::scale(glm::mat4(1.0), scale); - // parentToLocalScale = glm::scale(glm::mat4(1.0), glm::vec3(1.0 / scale.x, 1.0 / scale.y, 1.0 / scale.z)); - // if (useRelativeMotionBlur) { - // prevLocalToParentScale = glm::scale(glm::mat4(1.0), scale + scalarVelocity); - // prevParentToLocalScale = glm::scale(glm::mat4(1.0), glm::vec3(1.0 / (scale.x + scalarVelocity.x), 1.0 / (scale.y + scalarVelocity.y), 1.0 / (scale.z + scalarVelocity.z))); - // } else { - // prevLocalToParentScale = glm::scale(glm::mat4(1.0), prevScale); - // prevParentToLocalScale = glm::scale(glm::mat4(1.0), glm::vec3(1.0 / prevScale.x, 1.0 / prevScale.y, 1.0 / prevScale.z)); - // } - updateMatrix(); - markDirty(); -} - void Transform::updateMatrix() { localToParentMatrix = (localToParentTransform * getLocalToParentTranslationMatrix(false) * getLocalToParentRotationMatrix(false) * getLocalToParentScaleMatrix(false)); @@ -953,6 +908,8 @@ glm::mat4 Transform::getLocalToWorldMatrix(bool previous) { void Transform::updateChildren() { + if (children.size() == 0) return; + for (auto &c : children) { auto &t = transforms[c]; t.updateChildren(); From 
ded9a621a82181613d91e8133745120d4efe02b9 Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 27 Oct 2021 15:53:00 -0600 Subject: [PATCH 32/55] adding back world matrix calculation --- src/nvisii/transform.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/nvisii/transform.cpp b/src/nvisii/transform.cpp index 754eda1c..a31e9c10 100644 --- a/src/nvisii/transform.cpp +++ b/src/nvisii/transform.cpp @@ -908,8 +908,6 @@ glm::mat4 Transform::getLocalToWorldMatrix(bool previous) { void Transform::updateChildren() { - if (children.size() == 0) return; - for (auto &c : children) { auto &t = transforms[c]; t.updateChildren(); From 6ec79acf0a9979336a29efee81a1f8d44586553b Mon Sep 17 00:00:00 2001 From: n8vm Date: Wed, 27 Oct 2021 17:51:45 -0600 Subject: [PATCH 33/55] merging the two initialize loops into one, in the future this will make it easier to catch bugs between headless mode vs interactive. --- examples/02.random_scene.py | 1 - examples/03.pybullet.py | 4 +- examples/09.meta_data_exporting.py | 2 +- examples/12.pybullet_motion_blur.py | 2 +- include/nvisii/transform.h | 8 +- src/nvisii/nvisii.cpp | 201 ++++++++-------------------- src/nvisii/transform.cpp | 96 ++++--------- 7 files changed, 89 insertions(+), 225 deletions(-) diff --git a/examples/02.random_scene.py b/examples/02.random_scene.py index 370b8e12..bd9d2bde 100644 --- a/examples/02.random_scene.py +++ b/examples/02.random_scene.py @@ -21,7 +21,6 @@ nvisii.initialize( headless = True, verbose = True, - lazy_updates = True, max_entities = opt.nb_objs + 1, max_transforms = opt.nb_objs + 1, max_materials = opt.nb_objs, diff --git a/examples/03.pybullet.py b/examples/03.pybullet.py index f9dfa71d..9caba83c 100644 --- a/examples/03.pybullet.py +++ b/examples/03.pybullet.py @@ -26,8 +26,8 @@ print(f'created folder {opt.outf}/') # # # # # # # # # # # # # # # # # # # # # # # # # -# show an interactive window, and use "lazy" updates for faster object creation time -nvisii.initialize(headless=False, 
lazy_updates=True) +# show an interactive window +nvisii.initialize(headless=False) if not opt.noise is True: nvisii.enable_denoiser() diff --git a/examples/09.meta_data_exporting.py b/examples/09.meta_data_exporting.py index bf464cbd..07777490 100644 --- a/examples/09.meta_data_exporting.py +++ b/examples/09.meta_data_exporting.py @@ -17,7 +17,7 @@ print(f'created folder {opt.outf}/') # # # # # # # # # # # # # # # # # # # # # # # # # -nvisii.initialize(headless=False, verbose=True, lazy_updates = True) +nvisii.initialize(headless=False, verbose=True) nvisii.enable_denoiser() diff --git a/examples/12.pybullet_motion_blur.py b/examples/12.pybullet_motion_blur.py index 121fcaae..3b85e5c5 100644 --- a/examples/12.pybullet_motion_blur.py +++ b/examples/12.pybullet_motion_blur.py @@ -25,7 +25,7 @@ print(f'created folder {opt.outf}/') # # # # # # # # # # # # # # # # # # # # # # # # # -nvisii.initialize(headless = False, lazy_updates=True) +nvisii.initialize(headless = False) if not opt.noise is True: nvisii.enable_denoiser() diff --git a/include/nvisii/transform.h b/include/nvisii/transform.h index 65559326..787c344d 100644 --- a/include/nvisii/transform.h +++ b/include/nvisii/transform.h @@ -78,15 +78,9 @@ class Transform : public StaticFactory static std::vector transformStructs; static std::map lookupTable; - /* Updates cached final local to parent matrix values */ + /* Updates cached final local to parent and local to world matrix values */ void updateMatrix(); - /* Updates cached final local to world matrix values */ - void updateWorldMatrix(); - - /* updates all childrens cached final local to world matrix values */ - void updateChildren(); - /* updates the struct for this transform which can be uploaded to the GPU. 
*/ void updateStruct(); diff --git a/src/nvisii/nvisii.cpp b/src/nvisii/nvisii.cpp index 2abfc2a1..bc8ab326 100644 --- a/src/nvisii/nvisii.cpp +++ b/src/nvisii/nvisii.cpp @@ -1143,6 +1143,15 @@ void updateComponents() std::lock_guard texture_lock(Texture::areAnyDirty() ? *Texture::getEditMutex().get() : dummyMutex); std::lock_guard volume_lock(Volume::areAnyDirty() ? *Volume::getEditMutex().get() : dummyMutex); + // Manage transforms + auto dirtyTransforms = Transform::getDirtyTransforms(); + if (dirtyTransforms.size() > 0) { + Transform::updateComponents(); + + // cudaSetDevice(0); + owlBufferUpload(OptixData.transformBuffer, Transform::getFrontStruct()); + } + // Manage Meshes: Build / Rebuild BLAS auto dirtyMeshes = Mesh::getDirtyMeshes(); if (dirtyMeshes.size() > 0) { @@ -1489,15 +1498,6 @@ void updateComponents() owlBufferUpload(OptixData.textureBuffer, OptixData.textureStructs.data()); } - // Manage transforms - auto dirtyTransforms = Transform::getDirtyTransforms(); - if (dirtyTransforms.size() > 0) { - Transform::updateComponents(); - - // cudaSetDevice(0); - owlBufferUpload(OptixData.transformBuffer, Transform::getFrontStruct()); - } - // Manage Cameras if (Camera::areAnyDirty()) { Camera::updateComponents(); @@ -2306,10 +2306,8 @@ void initializeComponentFactories( void reproject(glm::vec4 *samplesBuffer, glm::vec4 *t0AlbedoBuffer, glm::vec4 *t1AlbedoBuffer, glm::vec4 *mvecBuffer, glm::vec4 *scratchBuffer, glm::vec4 *imageBuffer, int width, int height); - -static bool initializeInteractiveDeprecatedShown = false; -static bool initializeHeadlessDeprecatedShown = false; -void initializeInteractive( +void initialize( + bool headless, bool windowOnTop, bool _verbose, uint32_t maxEntities, @@ -2319,13 +2317,8 @@ void initializeInteractive( uint32_t maxMaterials, uint32_t maxLights, uint32_t maxTextures, - uint32_t maxVolumes) + uint32_t maxVolumes) { - if (initializeInteractiveDeprecatedShown == false) { - std::cout<<"Warning, initialize_interactive is 
deprecated and will be removed in a subsequent release. Please switch to initialize." << std::endl; - initializeInteractiveDeprecatedShown = true; - } - // don't initialize more than once if (initialized == true) { throw std::runtime_error("Error: already initialized!"); @@ -2335,56 +2328,67 @@ void initializeInteractive( stopped = false; verbose = _verbose; NVISII.callback = nullptr; + NVISII.headlessMode = headless; initializeComponentFactories(maxEntities, maxCameras, maxTransforms, maxMeshes, maxMaterials, maxLights, maxTextures, maxVolumes); auto loop = [windowOnTop]() { NVISII.render_thread_id = std::this_thread::get_id(); - NVISII.headlessMode = false; + Libraries::GLFW *glfw = nullptr; - auto glfw = Libraries::GLFW::Get(); - WindowData.window = glfw->create_window("NVISII", 512, 512, windowOnTop, true, true); - WindowData.currentSize = WindowData.lastSize = ivec2(512, 512); - glfw->make_context_current("NVISII"); - glfw->poll_events(); + if (!NVISII.headlessMode) + { + glfw = Libraries::GLFW::Get(); + WindowData.window = glfw->create_window("NVISII", 512, 512, windowOnTop, true, true); + WindowData.currentSize = WindowData.lastSize = ivec2(512, 512); + glfw->make_context_current("NVISII"); + glfw->poll_events(); + } - initializeOptix(/*headless = */ false); - initializeImgui(); + initializeOptix(/*headless = */ NVISII.headlessMode); int numGPUs = owlGetDeviceCount(OptixData.context); - glEnable(GL_BLEND); - glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + if (!NVISII.headlessMode) { + initializeImgui(); + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); + } while (!stopped) { - /* Poll events from the window */ - glfw->poll_events(); - if (!paused) { - glfw->swap_buffers("NVISII"); - glClearColor(0,0,0,0); - glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + if (!NVISII.headlessMode) { + /* Poll events from the window */ + glfw->poll_events(); + glfw->swap_buffers("NVISII"); + glClearColor(0,0,0,0); + 
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + } if (NVISII.callback && NVISII.callbackMutex.try_lock()) { NVISII.callback(); NVISII.callbackMutex.unlock(); } - + static double start=0; static double stop=0; - start = glfwGetTime(); - updateFrameBuffer(); + if (!NVISII.headlessMode) { + start = glfwGetTime(); + updateFrameBuffer(); + } + updateComponents(); updateLaunchParams(); - + for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { cudaSetDevice(deviceID); cudaEventRecord(NVISII.events[deviceID].first, owlParamsGetCudaStream(OptixData.launchParams, deviceID)); owlAsyncLaunch2DOnDevice(OptixData.rayGen, OptixData.LP.frameSize.x * OptixData.LP.frameSize.y, 1, deviceID, OptixData.launchParams); cudaEventRecord(NVISII.events[deviceID].second, owlParamsGetCudaStream(OptixData.launchParams, deviceID)); } + owlLaunchSync(OptixData.launchParams); for (uint32_t deviceID = 0; deviceID < numGPUs; deviceID++) { cudaEventElapsedTime(&NVISII.times[deviceID], NVISII.events[deviceID].first, NVISII.events[deviceID].second); @@ -2395,96 +2399,36 @@ void initializeInteractive( if (OptixData.enableDenoiser) { denoiseImage(); } - // glm::vec4* samplePtr = (glm::vec4*) owlBufferGetPointer(OptixData.accumBuffer,0); - // glm::vec4* mvecPtr = (glm::vec4*) owlBufferGetPointer(OptixData.mvecBuffer,0); - // glm::vec4* t0AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.scratchBuffer,0); - // glm::vec4* t1AlbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.albedoBuffer,0); - // glm::vec4* fbPtr = (glm::vec4*) owlBufferGetPointer(OptixData.frameBuffer,0); - // glm::vec4* sPtr = (glm::vec4*) owlBufferGetPointer(OptixData.normalBuffer,0); - // int width = OptixData.LP.frameSize.x; - // int height = OptixData.LP.frameSize.y; - // reproject(samplePtr, t0AlbPtr, t1AlbPtr, mvecPtr, sPtr, fbPtr, width, height); - drawFrameBufferToWindow(); - stop = glfwGetTime(); - glfwSetWindowTitle(WindowData.window, std::to_string(1.f / (stop - start)).c_str()); - drawGUI(); + if 
(!NVISII.headlessMode) { + drawFrameBufferToWindow(); + stop = glfwGetTime(); + glfwSetWindowTitle(WindowData.window, std::to_string(1.f / (stop - start)).c_str()); + drawGUI(); + } } processCommandQueue(); checkForErrors(); - if (stopped) break; } if (OptixData.denoiser) OPTIX_CHECK(optixDenoiserDestroy(OptixData.denoiser)); - if (OptixData.imageTexID != -1) { - if (OptixData.cudaResourceTex) { - cudaGraphicsUnregisterResource(OptixData.cudaResourceTex); - OptixData.cudaResourceTex = 0; + if (!NVISII.headlessMode) { + if (OptixData.imageTexID != -1) { + if (OptixData.cudaResourceTex) { + cudaGraphicsUnregisterResource(OptixData.cudaResourceTex); + OptixData.cudaResourceTex = 0; + } + glDeleteTextures(1, &OptixData.imageTexID); } - glDeleteTextures(1, &OptixData.imageTexID); - } - - ImGui::DestroyContext(); - if (glfw->does_window_exist("NVISII")) glfw->destroy_window("NVISII"); - - owlContextDestroy(OptixData.context); - }; - - renderThread = std::thread(loop); - // Waits for the render thread to start before returning - enqueueCommandAndWait([] () {}); -} - -void initializeHeadless( - bool _verbose, - uint32_t maxEntities, - uint32_t maxCameras, - uint32_t maxTransforms, - uint32_t maxMeshes, - uint32_t maxMaterials, - uint32_t maxLights, - uint32_t maxTextures, - uint32_t maxVolumes) -{ - if (initializeHeadlessDeprecatedShown == false) { - std::cout<<"Warning, initialize_headless is deprecated and will be removed in a subsequent release. Please switch to initialize(headless = True)." 
<< std::endl; - initializeHeadlessDeprecatedShown = true; - } - - // don't initialize more than once - if (initialized == true) { - throw std::runtime_error("Error: already initialized!"); - } - - initialized = true; - stopped = false; - verbose = _verbose; - NVISII.callback = nullptr; - - initializeComponentFactories(maxEntities, maxCameras, maxTransforms, maxMeshes, maxMaterials, maxLights, maxTextures, maxVolumes); - - auto loop = []() { - NVISII.render_thread_id = std::this_thread::get_id(); - NVISII.headlessMode = true; - - initializeOptix(/*headless = */ true); - - while (!stopped) - { - if(NVISII.callback){ - NVISII.callback(); - } - processCommandQueue(); - if (stopped) break; + ImGui::DestroyContext(); + auto glfw = Libraries::GLFW::Get(); + if (glfw->does_window_exist("NVISII")) glfw->destroy_window("NVISII"); } - - if (OptixData.denoiser) - OPTIX_CHECK(optixDenoiserDestroy(OptixData.denoiser)); owlContextDestroy(OptixData.context); }; @@ -2495,33 +2439,6 @@ void initializeHeadless( enqueueCommandAndWait([] () {}); } -void initialize( - bool headless, - bool windowOnTop, - bool verbose, - uint32_t maxEntities, - uint32_t maxCameras, - uint32_t maxTransforms, - uint32_t maxMeshes, - uint32_t maxMaterials, - uint32_t maxLights, - uint32_t maxTextures, - uint32_t maxVolumes) -{ - // prevents deprecated warning from showing - initializeInteractiveDeprecatedShown = true; - initializeHeadlessDeprecatedShown = true; - - if (headless) - initializeHeadless( - verbose, maxEntities, maxCameras, maxTransforms, maxMeshes, - maxMaterials, maxLights, maxTextures, maxVolumes); - else - initializeInteractive( - windowOnTop, verbose, maxEntities, maxCameras, maxTransforms, - maxMeshes, maxMaterials, maxLights, maxTextures, maxVolumes); -} - static bool registerPreRenderCallbackDeprecatedShown = false; void registerPreRenderCallback(std::function callback){ if (registerPreRenderCallbackDeprecatedShown == false) { diff --git a/src/nvisii/transform.cpp 
b/src/nvisii/transform.cpp index a31e9c10..0e1ec67a 100644 --- a/src/nvisii/transform.cpp +++ b/src/nvisii/transform.cpp @@ -42,6 +42,14 @@ std::set Transform::getDirtyTransforms() void Transform::updateComponents() { if (dirtyTransforms.size() == 0) return; + + // first, update transform matrices + for (auto &t : dirtyTransforms) { + if (!t->isInitialized()) continue; + t->updateMatrix(); + } + + // next, copy over the local to world matrices for (auto &t : dirtyTransforms) { if (!t->isInitialized()) continue; transformStructs[t->id].localToWorld = t->getLocalToWorldMatrix(false); @@ -299,7 +307,6 @@ void Transform::lookAt(vec3 at, vec3 up, vec3 eye, bool previous) // localToParentTranslation = glm::translate(glm::mat4(1.0), position); // parentToLocalTranslation = glm::translate(glm::mat4(1.0), -position); -// updateMatrix(); // markDirty(); // } @@ -326,7 +333,6 @@ void Transform::rotateAround(vec3 point, glm::quat rot, bool previous) // ltpt = glm::translate(glm::mat4(1.0), t); // ptlt = glm::translate(glm::mat4(1.0), -t); - updateMatrix(); markDirty(); } @@ -407,7 +413,6 @@ void Transform::setTransform(glm::mat4 transformation, bool decompose, bool prev this->localToParentTransform = transformation; // this->parentToLocalTransform = glm::inverse(transformation); } - updateMatrix(); } markDirty(); } @@ -423,7 +428,6 @@ void Transform::setRotation(quat newRotation, bool previous) if (previous) useRelativeAngularMotionBlur = false; auto &r = (previous) ? prevRotation : rotation; r = glm::normalize(newRotation); - updateMatrix(); markDirty(); } @@ -436,7 +440,6 @@ void Transform::addRotation(quat additionalRotation, bool previous) { if (previous) useRelativeAngularMotionBlur = false; setRotation(getRotation(previous) * additionalRotation, previous); - updateMatrix(); markDirty(); } @@ -498,7 +501,6 @@ void Transform::setPosition(vec3 newPosition, bool previous) if (previous) useRelativeLinearMotionBlur = false; auto &p = (previous) ? 
prevPosition : position; p = newPosition; - updateMatrix(); markDirty(); } @@ -506,7 +508,6 @@ void Transform::addPosition(vec3 additionalPosition, bool previous) { if (previous) useRelativeLinearMotionBlur = false; setPosition(getPosition(previous) + additionalPosition, previous); - updateMatrix(); markDirty(); } @@ -516,7 +517,6 @@ void Transform::setLinearVelocity(vec3 newLinearVelocity, float framesPerSecond, mix = glm::clamp(mix, 0.f, 1.f); newLinearVelocity /= framesPerSecond; linearMotion = glm::mix(newLinearVelocity, linearMotion, mix); - updateMatrix(); markDirty(); } @@ -528,7 +528,6 @@ void Transform::setAngularVelocity(quat newAngularVelocity, float framesPerSecon newAngularVelocity[1] = newAngularVelocity[1] / framesPerSecond; newAngularVelocity[2] = newAngularVelocity[2] / framesPerSecond; angularMotion = glm::lerp(newAngularVelocity, angularMotion, mix); - updateMatrix(); markDirty(); } @@ -538,7 +537,6 @@ void Transform::setScalarVelocity(vec3 newScalarVelocity, float framesPerSecond, mix = glm::clamp(mix, 0.f, 1.f); newScalarVelocity /= framesPerSecond; scalarMotion = glm::mix(newScalarVelocity, scalarMotion, mix); - updateMatrix(); markDirty(); } @@ -550,7 +548,6 @@ void Transform::clearMotion() scalarMotion = glm::vec3(0.f); angularMotion = glm::quat(1.f, 0.f, 0.f, 0.f); linearMotion = glm::vec3(0.f); - updateMatrix(); markDirty(); } @@ -577,14 +574,12 @@ void Transform::setScale(vec3 newScale, bool previous) if (previous) useRelativeScalarMotionBlur = false; auto &s = (previous) ? 
prevScale : scale; s = newScale; - updateMatrix(); markDirty(); } // void Transform::setScale(float newScale) // { // scale = vec3(newScale, newScale, newScale); -// updateMatrix(); // markDirty(); // } @@ -592,7 +587,6 @@ void Transform::addScale(vec3 additionalScale, bool previous) { if (previous) useRelativeScalarMotionBlur = false; setScale(getScale(previous) + additionalScale, previous); - updateMatrix(); markDirty(); } @@ -616,24 +610,30 @@ void Transform::addScale(vec3 additionalScale, bool previous) void Transform::updateMatrix() { + // Update the current transform matrices localToParentMatrix = (localToParentTransform * getLocalToParentTranslationMatrix(false) * getLocalToParentRotationMatrix(false) * getLocalToParentScaleMatrix(false)); parentToLocalMatrix = (getParentToLocalScaleMatrix(false) * getParentToLocalRotationMatrix(false) * getParentToLocalTranslationMatrix(false) * glm::inverse(localToParentTransform)); prevLocalToParentMatrix = (prevLocalToParentTransform * getLocalToParentTranslationMatrix(true) * getLocalToParentRotationMatrix(true) * getLocalToParentScaleMatrix(true)); prevParentToLocalMatrix = (getParentToLocalScaleMatrix(true) * getParentToLocalRotationMatrix(true) * getParentToLocalTranslationMatrix(true) * glm::inverse(prevLocalToParentTransform)); - // right = glm::vec3(localToParentMatrix[0]); - // up = glm::vec3(localToParentMatrix[1]); - // forward = glm::vec3(localToParentMatrix[2]); - // position = glm::vec3(localToParentMatrix[3]); - - // prevRight = glm::vec3(prevLocalToParentMatrix[0]); - // prevUp = glm::vec3(prevLocalToParentMatrix[1]); - // prevForward = glm::vec3(prevLocalToParentMatrix[2]); - // prevPosition = glm::vec3(prevLocalToParentMatrix[3]); + if (parent == -1) { + worldToLocalMatrix = parentToLocalMatrix; + localToWorldMatrix = localToParentMatrix; + prevWorldToLocalMatrix = prevParentToLocalMatrix; + prevLocalToWorldMatrix = prevLocalToParentMatrix; + } else { + worldToLocalMatrix = 
computeWorldToLocalMatrix(/*previous=*/false); + prevWorldToLocalMatrix = computeWorldToLocalMatrix(/*previous=*/true); + localToWorldMatrix = glm::inverse(worldToLocalMatrix); + prevLocalToWorldMatrix = glm::inverse(prevWorldToLocalMatrix); + } - updateChildren(); - markDirty(); + // If this transform has children, update those too. + for (auto &c : children) { + auto &t = transforms[c]; + t.updateMatrix(); + } } glm::mat4 Transform::computeWorldToLocalMatrix(bool previous) @@ -656,37 +656,6 @@ glm::mat4 Transform::computeWorldToLocalMatrix(bool previous) // else return getNextParentToLocalMatrix(); // } -void Transform::updateWorldMatrix() -{ - if (parent == -1) { - worldToLocalMatrix = parentToLocalMatrix; - localToWorldMatrix = localToParentMatrix; - prevWorldToLocalMatrix = prevParentToLocalMatrix; - prevLocalToWorldMatrix = prevLocalToParentMatrix; - - // worldScale = scale; - // worldTranslation = position; - // worldRotation = rotation; - // worldSkew = glm::vec3(0.f, 0.f, 0.f); - // worldPerspective = glm::vec4(1.0f, 1.0f, 1.0f, 1.0f); // not sure what this should default to... - - // prevWorldScale = prevScale; - // prevWorldTranslation = prevPosition; - // prevWorldRotation = prevRotation; - // prevWorldSkew = glm::vec3(0.f, 0.f, 0.f); - // prevWorldPerspective = glm::vec4(1.0f, 1.0f, 1.0f, 1.0f); // not sure what this should default to... 
- } else { - worldToLocalMatrix = computeWorldToLocalMatrix(/*previous=*/false); - prevWorldToLocalMatrix = computeWorldToLocalMatrix(/*previous=*/true); - localToWorldMatrix = glm::inverse(worldToLocalMatrix); - prevLocalToWorldMatrix = glm::inverse(prevWorldToLocalMatrix); - // glm::decompose(localToWorldMatrix, worldScale, worldRotation, worldTranslation, worldSkew, worldPerspective); - // glm::decompose(prevLocalToWorldMatrix, prevWorldScale, prevWorldRotation, prevWorldTranslation, prevWorldSkew, prevWorldPerspective); - // glm::decompose(nextLocalToWorldMatrix, worldScale, worldRotation, worldTranslation, worldSkew, worldPerspective); - } - markDirty(); -} - glm::mat4 Transform::getParentToLocalMatrix(bool previous) { if (previous) return prevParentToLocalMatrix; @@ -787,7 +756,6 @@ void Transform::setParent(Transform *parent) { this->parent = parent->getId(); transforms[parent->getId()].children.insert(this->id); - updateChildren(); markDirty(); } @@ -800,7 +768,6 @@ void Transform::clearParent() transforms[parent].children.erase(this->id); this->parent = -1; - updateChildren(); markDirty(); } @@ -829,7 +796,6 @@ void Transform::removeChild(Transform *object) { children.erase(object->getId()); transforms[object->getId()].parent = -1; - transforms[object->getId()].updateWorldMatrix(); transforms[object->getId()].markDirty(); } @@ -905,18 +871,6 @@ glm::mat4 Transform::getLocalToWorldMatrix(bool previous) { // return m; // } - -void Transform::updateChildren() -{ - for (auto &c : children) { - auto &t = transforms[c]; - t.updateChildren(); - } - - updateWorldMatrix(); - markDirty(); -} - TransformStruct &Transform::getStruct() { return transformStructs[id]; From 2c01f62f5a3891d2c923ba1301efe27a290dd54f Mon Sep 17 00:00:00 2001 From: jtremblay Date: Fri, 10 Jun 2022 09:08:32 -0700 Subject: [PATCH 34/55] Update README.md Forcing a CLI compile --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6c93964a..a86081cd 100644 --- 
a/README.md +++ b/README.md @@ -13,6 +13,7 @@ For more information see our [ICLR workshop 2021 paper](https://arxiv.org/abs/21 [Documentation](https://nvisii.com). +