first commit

2023-08-02 19:51:43 -07:00
parent c2891c38cc
commit 13e18567fa
202 changed files with 43362 additions and 17 deletions
--- a/render/renderutils/init.py
+++ b/render/renderutils/init.py
@@ -0,0 +1,11 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+from .ops import xfm_points, xfm_vectors, image_loss, diffuse_cubemap, specular_cubemap, prepare_shading_normal, lambert, frostbite_diffuse, pbr_specular, pbr_bsdf, _fresnel_shlick, _ndf_ggx, _lambda_ggx, _masking_smith
+# Public API of this package: exactly the names imported from .ops on the line above.
+__all__ = ["xfm_vectors", "xfm_points", "image_loss", "diffuse_cubemap","specular_cubemap", "prepare_shading_normal", "lambert", "frostbite_diffuse", "pbr_specular", "pbr_bsdf", "_fresnel_shlick", "_ndf_ggx", "_lambda_ggx", "_masking_smith", ]
--- a/render/renderutils/bsdf.py
+++ b/render/renderutils/bsdf.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import math
+import torch
+
+# Divisor applied to dot(view_vec, smooth_nrm) in _bend_normal; the ratio, clamped to [0, 1],
+# is the lerp weight between the geometric normal and the shading normal.
+NORMAL_THRESHOLD = 0.1
+
+################################################################################
+# Vector utility functions
+################################################################################
+
+def _dot(x, y):
+    return torch.sum(x*y, -1, keepdim=True)
+
+def _reflect(x, n):
+    return 2*_dot(x, n)*n - x
+
+def _safe_normalize(x):
+    return torch.nn.functional.normalize(x, dim = -1)
+
+def _bend_normal(view_vec, smooth_nrm, geom_nrm, two_sided_shading):
+    # Swap normal direction for backfacing surfaces
+    if two_sided_shading:
+        smooth_nrm = torch.where(_dot(geom_nrm, view_vec) > 0, smooth_nrm, -smooth_nrm)
+        geom_nrm   = torch.where(_dot(geom_nrm, view_vec) > 0, geom_nrm, -geom_nrm)
+
+    t = torch.clamp(_dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD, min=0, max=1)
+    return torch.lerp(geom_nrm, smooth_nrm, t)
+
+
+def _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl):
+    smooth_bitang = _safe_normalize(torch.cross(smooth_tng, smooth_nrm))
+    if opengl:
+        shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] - smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
+    else:
+        shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] + smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
+    return _safe_normalize(shading_nrm)
+
+def bsdf_prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl):
+    smooth_nrm = _safe_normalize(smooth_nrm)
+    smooth_tng = _safe_normalize(smooth_tng)
+    view_vec   = _safe_normalize(view_pos - pos)
+    shading_nrm = _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl)
+    return _bend_normal(view_vec, shading_nrm, geom_nrm, two_sided_shading)
+
+################################################################################
+# Simple lambertian diffuse BSDF
+################################################################################
+
+def bsdf_lambert(nrm, wi):
+    return torch.clamp(_dot(nrm, wi), min=0.0) / math.pi
+
+################################################################################
+# Frostbite diffuse
+################################################################################
+
+def bsdf_frostbite(nrm, wi, wo, linearRoughness):
+    wiDotN = _dot(wi, nrm)
+    woDotN = _dot(wo, nrm)
+
+    h = _safe_normalize(wo + wi)
+    wiDotH = _dot(wi, h)
+
+    energyBias = 0.5 * linearRoughness
+    energyFactor = 1.0 - (0.51 / 1.51) * linearRoughness
+    f90 = energyBias + 2.0 * wiDotH * wiDotH * linearRoughness
+    f0 = 1.0
+
+    wiScatter = bsdf_fresnel_shlick(f0, f90, wiDotN)
+    woScatter = bsdf_fresnel_shlick(f0, f90, woDotN)
+    res = wiScatter * woScatter * energyFactor
+    return torch.where((wiDotN > 0.0) & (woDotN > 0.0), res, torch.zeros_like(res))
+
+################################################################################
+# Phong specular, loosely based on mitsuba implementation
+################################################################################
+
+def bsdf_phong(nrm, wo, wi, N):
+    dp_r = torch.clamp(_dot(_reflect(wo, nrm), wi), min=0.0, max=1.0)
+    dp_l = torch.clamp(_dot(nrm, wi), min=0.0, max=1.0)
+    return (dp_r ** N) * dp_l * (N + 2) / (2 * math.pi)
+
+################################################################################
+# PBR's implementation of GGX specular
+################################################################################
+
+# Margin used to clamp cosine inputs of the Fresnel/GGX helpers to [eps, 1 - eps]; also the
+# front-facing threshold and the lower bound of the divisor in bsdf_pbr_specular.
+specular_epsilon = 1e-4
+
+def bsdf_fresnel_shlick(f0, f90, cosTheta):
+    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
+    return f0 + (f90 - f0) * (1.0 - _cosTheta) ** 5.0
+
+def bsdf_ndf_ggx(alphaSqr, cosTheta):
+    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
+    d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1
+    return alphaSqr / (d * d * math.pi)
+
+def bsdf_lambda_ggx(alphaSqr, cosTheta):
+    _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
+    cosThetaSqr = _cosTheta * _cosTheta
+    tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr
+    res = 0.5 * (torch.sqrt(1 + alphaSqr * tanThetaSqr) - 1.0)
+    return res
+
+def bsdf_masking_smith_ggx_correlated(alphaSqr, cosThetaI, cosThetaO):
+    lambdaI = bsdf_lambda_ggx(alphaSqr, cosThetaI)
+    lambdaO = bsdf_lambda_ggx(alphaSqr, cosThetaO)
+    return 1 / (1 + lambdaI + lambdaO)
+
+def bsdf_pbr_specular(col, nrm, wo, wi, alpha, min_roughness=0.08):
+    _alpha = torch.clamp(alpha, min=min_roughness*min_roughness, max=1.0)
+    alphaSqr = _alpha * _alpha
+
+    h = _safe_normalize(wo + wi)
+    woDotN = _dot(wo, nrm)
+    wiDotN = _dot(wi, nrm)
+    woDotH = _dot(wo, h)
+    nDotH  = _dot(nrm, h)
+
+    D = bsdf_ndf_ggx(alphaSqr, nDotH)
+    G = bsdf_masking_smith_ggx_correlated(alphaSqr, woDotN, wiDotN)
+    F = bsdf_fresnel_shlick(col, 1, woDotH)
+
+    w = F * D * G * 0.25 / torch.clamp(woDotN, min=specular_epsilon)
+
+    frontfacing = (woDotN > specular_epsilon) & (wiDotN > specular_epsilon)
+    return torch.where(frontfacing, w, torch.zeros_like(w))
+
+def bsdf_pbr(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF):
+    wo = _safe_normalize(view_pos - pos)
+    wi = _safe_normalize(light_pos - pos)
+
+    spec_str  = arm[..., 0:1] # x component
+    roughness = arm[..., 1:2] # y component
+    metallic  = arm[..., 2:3] # z component
+    ks = (0.04 * (1.0 - metallic) + kd * metallic) * (1 - spec_str)
+    kd = kd * (1.0 - metallic)
+
+    if BSDF == 0:
+        diffuse = kd * bsdf_lambert(nrm, wi)
+    else:
+        diffuse = kd * bsdf_frostbite(nrm, wi, wo, roughness)
+    specular = bsdf_pbr_specular(ks, nrm, wo, wi, roughness*roughness, min_roughness=min_roughness)
+    return diffuse + specular
--- a/render/renderutils/c_src/bsdf.cu
+++ b/render/renderutils/c_src/bsdf.cu
@@ -0,0 +1,710 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include "common.h"
+#include "bsdf.h"
+
+// Margin used to clamp cosine inputs to [eps, 1 - eps] in the Fresnel/GGX helpers below,
+// and the front-facing threshold in the PBR specular functions.
+#define SPECULAR_EPSILON 1e-4f
+
+//------------------------------------------------------------------------
+// Lambert functions
+
+// Lambert term: dot(nrm, wi) / M_PI, floored at zero.
+__device__ inline float fwdLambert(const vec3f nrm, const vec3f wi)
+{
+    return max(dot(nrm, wi) / M_PI, 0.0f);
+}
+
+// Backward of fwdLambert. When dot(nrm, wi) > 0 it forwards d_out / M_PI to bwdDot for
+// d_nrm and d_wi; otherwise (the forward max() selected 0) both gradients are left untouched.
+__device__ inline void bwdLambert(const vec3f nrm, const vec3f wi, vec3f& d_nrm, vec3f& d_wi, const float d_out)
+{
+    if (dot(nrm, wi) > 0.0f)
+        bwdDot(nrm, wi, d_nrm, d_wi, d_out / M_PI);
+}
+
+//------------------------------------------------------------------------
+// Fresnel Schlick 
+
+// Scalar Schlick Fresnel: blends f0 and f90 with weight scale = (1 - cos)^5, where
+// cosTheta is first clamped to [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON].
+__device__ inline float fwdFresnelSchlick(const float f0, const float f90, const float cosTheta)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float scale = powf(1.0f - _cosTheta, 5.0f);
+    return f0 * (1.0f - scale) + f90 * scale;
+}
+
+// Backward of the scalar fwdFresnelSchlick. Adds to d_f0, d_f90 and d_cosTheta.
+__device__ inline void bwdFresnelSchlick(const float f0, const float f90, const float cosTheta, float& d_f0, float& d_f90, float& d_cosTheta, const float d_out)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
+    // Forward is f0 * (1 - scale) + f90 * scale, so the weights are the partials.
+    d_f0 += d_out * (1.0 - scale);
+    d_f90 += d_out * scale;
+    // cosTheta only receives a gradient while the forward clamp is inactive.
+    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
+    {
+        d_cosTheta += d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f);
+    }
+}
+
+// vec3f overload of Schlick Fresnel: same formula as the scalar version, with f0 and f90
+// as vectors and a single scalar cosTheta.
+__device__ inline vec3f fwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float scale = powf(1.0f - _cosTheta, 5.0f);
+    return f0 * (1.0f - scale) + f90 * scale;
+}
+
+// Backward of the vec3f fwdFresnelSchlick. Adds to d_f0, d_f90 and d_cosTheta; the scalar
+// d_cosTheta collects the sum over the three channels.
+__device__ inline void bwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta, vec3f& d_f0, vec3f& d_f90, float& d_cosTheta, const vec3f d_out)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
+    d_f0 += d_out * (1.0 - scale);
+    d_f90 += d_out * scale;
+    // cosTheta only receives a gradient while the forward clamp is inactive.
+    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
+    {
+        d_cosTheta += sum(d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f));
+    }
+}
+
+//------------------------------------------------------------------------
+// Frostbite diffuse
+
+// Frostbite diffuse term. Returns 0 unless both dot(wi, nrm) and dot(wo, nrm) are strictly
+// positive; otherwise the product of two Schlick scatter factors (f0 = 1, roughness-dependent
+// f90) and an energy factor.
+__device__ inline float fwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness)
+{
+    float wiDotN = dot(wi, nrm);
+    float woDotN = dot(wo, nrm);
+    if (wiDotN > 0.0f && woDotN > 0.0f)
+    {
+        vec3f h = safeNormalize(wo + wi);
+        float wiDotH = dot(wi, h);
+
+        float energyBias = 0.5f * linearRoughness;
+        float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
+        float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
+        float f0 = 1.f;
+        
+        float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
+        float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
+        
+        return wiScatter * woScatter * energyFactor;
+    }
+    else return 0.0f;
+}
+
+// Backward of fwdFrostbiteDiffuse. Recomputes the forward intermediates, then walks the
+// forward statements in reverse, adding to d_nrm, d_wi, d_wo and d_linearRoughness.
+// Nothing is added when either cosine is not strictly positive (the forward returned 0).
+__device__ inline void bwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness, vec3f& d_nrm, vec3f& d_wi, vec3f& d_wo, float &d_linearRoughness, const float d_out)
+{
+    float wiDotN = dot(wi, nrm);
+    float woDotN = dot(wo, nrm);
+
+    if (wiDotN > 0.0f && woDotN > 0.0f)
+    {
+        // -------------- FWD (recomputed) --------------
+        vec3f h = safeNormalize(wo + wi);
+        float wiDotH = dot(wi, h);
+
+        float energyBias = 0.5f * linearRoughness;
+        float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
+        float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
+        float f0 = 1.f;
+        
+        float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
+        float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
+
+        // -------------- BWD --------------
+        // Backprop: return wiScatter * woScatter * energyFactor;
+        float d_wiScatter = d_out * woScatter * energyFactor;
+        float d_woScatter = d_out * wiScatter * energyFactor;
+        float d_energyFactor = d_out * wiScatter * woScatter; 
+
+        // Backprop: float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
+        // d_f0 is collected but not propagated further: f0 is the constant 1.
+        float d_woDotN = 0.0f, d_f0 = 0.0, d_f90 = 0.0f;
+        bwdFresnelSchlick(f0, f90, woDotN, d_f0, d_f90, d_woDotN, d_woScatter);
+
+        // Backprop: float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
+        float d_wiDotN = 0.0f;
+        bwdFresnelSchlick(f0, f90, wiDotN, d_f0, d_f90, d_wiDotN, d_wiScatter);
+
+        // Backprop: float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
+        float d_energyBias = d_f90;
+        float d_wiDotH = d_f90 * 4 * wiDotH * linearRoughness;
+        d_linearRoughness += d_f90 * 2 * wiDotH * wiDotH;
+
+        // Backprop: float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
+        d_linearRoughness -= (0.51f / 1.51f) * d_energyFactor;
+
+        // Backprop: float energyBias = 0.5f * linearRoughness;
+        d_linearRoughness += 0.5 * d_energyBias;
+
+        // Backprop: float wiDotH = dot(wi, h);
+        vec3f d_h(0);
+        bwdDot(wi, h, d_wi, d_h, d_wiDotH);
+
+        // Backprop: vec3f h = safeNormalize(wo + wi);
+        // The sum feeds both operands, so each receives the same gradient.
+        vec3f d_wo_wi(0);
+        bwdSafeNormalize(wo + wi, d_wo_wi, d_h);
+        d_wi += d_wo_wi; d_wo += d_wo_wi;
+
+        // Backprop: woDotN = dot(wo, nrm) and wiDotN = dot(wi, nrm)
+        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
+        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
+    }
+}
+
+//------------------------------------------------------------------------
+// Ndf GGX
+
+// GGX normal distribution: alphaSqr / (M_PI * d^2) with d = (alphaSqr - 1) * cos^2 + 1,
+// where cosTheta is clamped to [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON].
+__device__ inline float fwdNdfGGX(const float alphaSqr, const float cosTheta)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
+    return alphaSqr / (d * d * M_PI);
+}
+
+// Backward of fwdNdfGGX. Adds to d_alphaSqr always, and to d_cosTheta only when cosTheta
+// lies strictly inside the forward clamp range.
+__device__ inline void bwdNdfGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
+{
+    // Torch only back propagates if clamp doesn't trigger
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float cosThetaSqr = _cosTheta * _cosTheta;
+    // dD/d(alphaSqr) = (1 - (alphaSqr + 1) * cos^2) / (pi * d^3), d = (alphaSqr - 1) * cos^2 + 1
+    d_alphaSqr += d_out * (1.0f - (alphaSqr + 1.0f) * cosThetaSqr) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
+    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
+    {
+        // dD/d(cos) = -4 * (alphaSqr - 1) * alphaSqr * cos / (pi * d^3)
+        d_cosTheta += d_out * -(4.0f * (alphaSqr - 1.0f) * alphaSqr * cosTheta) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
+    }
+}
+
+//------------------------------------------------------------------------
+// Lambda GGX
+
+// GGX Lambda term: 0.5 * (sqrt(1 + alphaSqr * tan^2(theta)) - 1), with cosTheta clamped to
+// [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON] before tan^2 is derived from it.
+__device__ inline float fwdLambdaGGX(const float alphaSqr, const float cosTheta)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float cosThetaSqr = _cosTheta * _cosTheta;
+    float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
+    float res = 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);
+    return res;
+}
+
+// Backward of fwdLambdaGGX. Adds to d_alphaSqr always, and to d_cosTheta only when cosTheta
+// lies strictly inside the forward clamp range. The forward result itself is not needed
+// for either derivative, so it is not recomputed here.
+__device__ inline void bwdLambdaGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
+{
+    float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
+    float cosThetaSqr = _cosTheta * _cosTheta;
+    float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
+
+    // dLambda/d(alphaSqr) = 0.25 * tan^2 / sqrt(1 + alphaSqr * tan^2)
+    d_alphaSqr += d_out * (0.25 * tanThetaSqr) / sqrtf(alphaSqr * tanThetaSqr + 1.0f);
+    // dLambda/d(cos) = -0.5 * alphaSqr / (cos^3 * sqrt(alphaSqr / cos^2 - alphaSqr + 1))
+    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
+        d_cosTheta += d_out * -(0.5 * alphaSqr) / (powf(_cosTheta, 3.0f) * sqrtf(alphaSqr / cosThetaSqr - alphaSqr + 1.0f));
+}
+
+//------------------------------------------------------------------------
+// Masking GGX
+
+// Height-correlated Smith masking: 1 / (1 + Lambda(cosThetaI) + Lambda(cosThetaO)).
+__device__ inline float fwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO)
+{
+    float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
+    float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);
+    return 1.0f / (1.0f + lambdaI + lambdaO);
+}
+
+// Backward of fwdMaskingSmithGGXCorrelated. Adds to d_alphaSqr, d_cosThetaI and d_cosThetaO
+// by pushing the same upstream gradient through both Lambda evaluations.
+__device__ inline void bwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO, float& d_alphaSqr, float& d_cosThetaI, float& d_cosThetaO, const float d_out)
+{
+    // FWD eval
+    float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
+    float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);
+
+    // BWD eval
+    // d/dLambda of 1 / (1 + lambdaI + lambdaO) is identical for both Lambda terms.
+    float d_lambdaIO = -d_out / powf(1.0f + lambdaI + lambdaO, 2.0f);
+    bwdLambdaGGX(alphaSqr, cosThetaI, d_alphaSqr, d_cosThetaI, d_lambdaIO);
+    bwdLambdaGGX(alphaSqr, cosThetaO, d_alphaSqr, d_cosThetaO, d_lambdaIO);
+}
+
+//------------------------------------------------------------------------
+// GGX specular
+
+// GGX specular term F * D * G * 0.25 / dot(wo, nrm). alpha is clamped to
+// [min_roughness^2, 1] and squared. Returns 0 unless both dot(wo, nrm) and dot(wi, nrm)
+// exceed SPECULAR_EPSILON.
+__device__ vec3f fwdPbrSpecular(const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness)
+{
+    float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
+    float alphaSqr = _alpha * _alpha;
+
+    vec3f h = safeNormalize(wo + wi);
+    float woDotN = dot(wo, nrm);
+    float wiDotN = dot(wi, nrm);
+    float woDotH = dot(wo, h);
+    float nDotH = dot(nrm, h);
+
+    float D = fwdNdfGGX(alphaSqr, nDotH);
+    float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
+    vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
+    // The divisor is not guarded here; w is discarded below whenever woDotN <= SPECULAR_EPSILON.
+    vec3f w = F * D * G * 0.25 / woDotN;
+
+    bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
+    return frontfacing ? w : 0.0f;
+}
+
+// Backward of fwdPbrSpecular. Recomputes the forward intermediates and, for front-facing
+// configurations only, adds to d_col, d_nrm, d_wo, d_wi and d_alpha. d_alpha receives a
+// gradient only while the forward clamp of alpha to [min_roughness^2, 1] is inactive.
+__device__ void bwdPbrSpecular(
+    const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness,
+    vec3f& d_col, vec3f& d_nrm, vec3f& d_wo, vec3f& d_wi, float& d_alpha, const vec3f d_out)
+{
+    ///////////////////////////////////////////////////////////////////////
+    // FWD eval
+
+    float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
+    float alphaSqr = _alpha * _alpha;
+
+    vec3f h = safeNormalize(wo + wi);
+    float woDotN = dot(wo, nrm);
+    float wiDotN = dot(wi, nrm);
+    float woDotH = dot(wo, h);
+    float nDotH = dot(nrm, h);
+
+    float D = fwdNdfGGX(alphaSqr, nDotH);
+    float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
+    vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
+    bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
+
+    if (frontfacing)
+    {
+        ///////////////////////////////////////////////////////////////////////
+        // BWD eval
+
+        // Backprop: w = F * D * G * 0.25 / woDotN
+        vec3f d_F = d_out * D * G * 0.25f / woDotN;
+        float d_D = sum(d_out * F * G * 0.25f / woDotN);
+        float d_G = sum(d_out * F * D * 0.25f / woDotN);
+
+        float d_woDotN = -sum(d_out * F * D * G * 0.25f / (woDotN * woDotN));
+
+        // d_f90 is collected but unused: f90 is the constant 1 in the forward pass.
+        vec3f d_f90(0);
+        float d_woDotH(0), d_wiDotN(0), d_nDotH(0), d_alphaSqr(0);
+        bwdFresnelSchlick(col, 1.0f, woDotH, d_col, d_f90, d_woDotH, d_F);
+        bwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN, d_alphaSqr, d_woDotN, d_wiDotN, d_G);
+        bwdNdfGGX(alphaSqr, nDotH, d_alphaSqr, d_nDotH, d_D);
+
+        vec3f d_h(0);
+        bwdDot(nrm, h, d_nrm, d_h, d_nDotH);
+        bwdDot(wo, h, d_wo, d_h, d_woDotH);
+        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
+        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
+
+        // Backprop: h = safeNormalize(wo + wi); both operands receive the same gradient.
+        vec3f d_h_unnorm(0);
+        bwdSafeNormalize(wo + wi, d_h_unnorm, d_h);
+        d_wo += d_h_unnorm;
+        d_wi += d_h_unnorm;
+
+        // Backprop: alphaSqr = clamp(alpha)^2. The clamp has zero slope once it triggers,
+        // so alpha must be inside both bounds; alpha == 1 still passes the gradient.
+        if (alpha > min_roughness * min_roughness && alpha <= 1.0f)
+            d_alpha += d_alphaSqr * 2 * alpha;
+    }
+}
+
+//------------------------------------------------------------------------
+// Full PBR BSDF
+
+// Full BSDF: diffuse plus GGX specular. wo and wi are the normalized directions from pos to
+// view_pos and light_pos. arm.x scales the specular colour by (1 - arm.x), arm.y is squared
+// to obtain alpha, and arm.z blends between 0.04 and kd for the specular colour and
+// attenuates the diffuse colour. BSDF == 0 selects Lambert, any other value Frostbite.
+__device__ vec3f fwdPbrBSDF(const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF)
+{
+    vec3f wo = safeNormalize(view_pos - pos);
+    vec3f wi = safeNormalize(light_pos - pos);
+
+    float alpha = arm.y * arm.y;
+    vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
+    vec3f diff_col = kd * (1.0f - arm.z);
+
+    float diff = 0.0f;
+    if (BSDF == 0)
+        diff = fwdLambert(nrm, wi);
+    else
+        diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);    
+    vec3f diffuse = diff_col * diff;
+    vec3f specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness);
+
+    return diffuse + specular;
+}
+
+// Backward of fwdPbrBSDF. Recomputes the forward intermediates, then adds to d_kd, d_arm,
+// d_pos, d_nrm, d_view_pos and d_light_pos given the upstream gradient d_out.
+__device__ void bwdPbrBSDF(
+    const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF,
+    vec3f& d_kd, vec3f& d_arm, vec3f& d_pos, vec3f& d_nrm, vec3f& d_view_pos, vec3f& d_light_pos, const vec3f d_out)
+{
+    ////////////////////////////////////////////////////////////////////////
+    // FWD
+    // The unnormalized directions are kept because bwdSafeNormalize needs them below.
+    vec3f _wi = light_pos - pos;
+    vec3f _wo = view_pos - pos;
+    vec3f wi = safeNormalize(_wi);
+    vec3f wo = safeNormalize(_wo);
+
+    float alpha = arm.y * arm.y;
+    vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
+    vec3f diff_col = kd * (1.0f - arm.z);
+    float diff = 0.0f;
+    if (BSDF == 0)
+        diff = fwdLambert(nrm, wi);
+    else
+        diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);    
+
+    ////////////////////////////////////////////////////////////////////////
+    // BWD
+
+    // Backprop: specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness)
+    float d_alpha(0);
+    vec3f d_spec_col(0), d_wi(0), d_wo(0);
+    bwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness, d_spec_col, d_nrm, d_wo, d_wi, d_alpha, d_out);
+
+    // Backprop: diffuse = diff_col * diff (scalar diff collects the sum over channels)
+    float d_diff = sum(diff_col * d_out);
+    if (BSDF == 0)
+        bwdLambert(nrm, wi, d_nrm, d_wi, d_diff);
+    else
+        bwdFrostbiteDiffuse(nrm, wi, wo, arm.y, d_nrm, d_wi, d_wo, d_arm.y, d_diff);    
+
+    // Backprop: diff_col = kd * (1.0f - arm.z)
+    vec3f d_diff_col = d_out * diff;
+    d_kd += d_diff_col * (1.0f - arm.z);
+    d_arm.z -= sum(d_diff_col * kd);
+
+    // Backprop: spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x)
+    d_kd -= d_spec_col * (arm.x - 1.0f) * arm.z;
+    d_arm.x += sum(d_spec_col * (arm.z * (0.04f - kd) - 0.04f));
+    d_arm.z -= sum(d_spec_col * (kd - 0.04f) * (arm.x - 1.0f));
+
+    // Backprop: alpha = arm.y * arm.y
+    d_arm.y += d_alpha * 2 * arm.y;
+
+    // Backprop: vec3f wi = safeNormalize(light_pos - pos);
+    vec3f d__wi(0);
+    bwdSafeNormalize(_wi, d__wi, d_wi);
+    d_light_pos += d__wi;
+    d_pos -= d__wi;
+
+    // Backprop: vec3f wo = safeNormalize(view_pos - pos);
+    vec3f d__wo(0);
+    bwdSafeNormalize(_wo, d__wo, d_wo);
+    d_view_pos += d__wo;
+    d_pos -= d__wo;
+}
+
+//------------------------------------------------------------------------
+// Kernels
+
+// Forward Lambert kernel: evaluates fwdLambert for one element per thread and stores it in p.out.
+__global__ void LambertFwdKernel(LambertKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+
+    float lambert = fwdLambert(normal, incoming);
+
+    p.out.store(ix, iy, iz, lambert);
+}
+
+// Backward Lambert kernel: reads the upstream gradient from p.out and stores the gradients
+// for nrm and wi.
+__global__ void LambertBwdKernel(LambertKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+    float gradOut = p.out.fetch1(ix, iy, iz);
+
+    vec3f gradNrm(0);
+    vec3f gradWi(0);
+    bwdLambert(normal, incoming, gradNrm, gradWi, gradOut);
+
+    p.nrm.store_grad(ix, iy, iz, gradNrm);
+    p.wi.store_grad(ix, iy, iz, gradWi);
+}
+
+// Forward Frostbite diffuse kernel: evaluates fwdFrostbiteDiffuse per element into p.out.
+__global__ void FrostbiteDiffuseFwdKernel(FrostbiteDiffuseKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+    vec3f outgoing = p.wo.fetch3(ix, iy, iz);
+    float roughness = p.linearRoughness.fetch1(ix, iy, iz);
+
+    float diffuse = fwdFrostbiteDiffuse(normal, incoming, outgoing, roughness);
+
+    p.out.store(ix, iy, iz, diffuse);
+}
+
+// Backward Frostbite diffuse kernel: reads the upstream gradient from p.out and stores the
+// gradients for nrm, wi, wo and linearRoughness.
+__global__ void FrostbiteDiffuseBwdKernel(FrostbiteDiffuseKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+    vec3f outgoing = p.wo.fetch3(ix, iy, iz);
+    float roughness = p.linearRoughness.fetch1(ix, iy, iz);
+    float gradOut = p.out.fetch1(ix, iy, iz);
+
+    vec3f gradNrm(0);
+    vec3f gradWi(0);
+    vec3f gradWo(0);
+    float gradRoughness = 0.0f;
+    bwdFrostbiteDiffuse(normal, incoming, outgoing, roughness, gradNrm, gradWi, gradWo, gradRoughness, gradOut);
+
+    p.nrm.store_grad(ix, iy, iz, gradNrm);
+    p.wi.store_grad(ix, iy, iz, gradWi);
+    p.wo.store_grad(ix, iy, iz, gradWo);
+    p.linearRoughness.store_grad(ix, iy, iz, gradRoughness);
+}
+
+// Forward Schlick Fresnel kernel: evaluates the vec3f fwdFresnelSchlick per element into p.out.
+__global__ void FresnelShlickFwdKernel(FresnelShlickKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f base = p.f0.fetch3(ix, iy, iz);
+    vec3f grazing = p.f90.fetch3(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+
+    vec3f fresnel = fwdFresnelSchlick(base, grazing, cosine);
+    p.out.store(ix, iy, iz, fresnel);
+}
+
+// Backward Schlick Fresnel kernel: reads the upstream gradient from p.out and stores the
+// gradients for f0, f90 and cosTheta.
+__global__ void FresnelShlickBwdKernel(FresnelShlickKernelParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f base = p.f0.fetch3(ix, iy, iz);
+    vec3f grazing = p.f90.fetch3(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+    vec3f gradOut = p.out.fetch3(ix, iy, iz);
+
+    vec3f gradF0(0);
+    vec3f gradF90(0);
+    float gradCos = 0.0f;
+    bwdFresnelSchlick(base, grazing, cosine, gradF0, gradF90, gradCos, gradOut);
+
+    p.f0.store_grad(ix, iy, iz, gradF0);
+    p.f90.store_grad(ix, iy, iz, gradF90);
+    p.cosTheta.store_grad(ix, iy, iz, gradCos);
+}
+
+// Forward GGX NDF kernel: evaluates fwdNdfGGX per element into p.out.
+__global__ void ndfGGXFwdKernel(NdfGGXParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+    float ndf = fwdNdfGGX(aSqr, cosine);
+
+    p.out.store(ix, iy, iz, ndf);
+}
+
+// Backward GGX NDF kernel: reads the upstream gradient from p.out and stores the gradients
+// for alphaSqr and cosTheta.
+__global__ void ndfGGXBwdKernel(NdfGGXParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+    float gradOut = p.out.fetch1(ix, iy, iz);
+
+    float gradASqr = 0.0f;
+    float gradCos = 0.0f;
+    bwdNdfGGX(aSqr, cosine, gradASqr, gradCos, gradOut);
+
+    p.alphaSqr.store_grad(ix, iy, iz, gradASqr);
+    p.cosTheta.store_grad(ix, iy, iz, gradCos);
+}
+
+// Forward GGX Lambda kernel: evaluates fwdLambdaGGX per element into p.out.
+__global__ void lambdaGGXFwdKernel(NdfGGXParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+    float lambda = fwdLambdaGGX(aSqr, cosine);
+
+    p.out.store(ix, iy, iz, lambda);
+}
+
+// Backward GGX Lambda kernel: reads the upstream gradient from p.out and stores the
+// gradients for alphaSqr and cosTheta.
+__global__ void lambdaGGXBwdKernel(NdfGGXParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosine = p.cosTheta.fetch1(ix, iy, iz);
+    float gradOut = p.out.fetch1(ix, iy, iz);
+
+    float gradASqr = 0.0f;
+    float gradCos = 0.0f;
+    bwdLambdaGGX(aSqr, cosine, gradASqr, gradCos, gradOut);
+
+    p.alphaSqr.store_grad(ix, iy, iz, gradASqr);
+    p.cosTheta.store_grad(ix, iy, iz, gradCos);
+}
+
+// Forward Smith masking kernel: evaluates fwdMaskingSmithGGXCorrelated per element into p.out.
+__global__ void maskingSmithFwdKernel(MaskingSmithParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosI = p.cosThetaI.fetch1(ix, iy, iz);
+    float cosO = p.cosThetaO.fetch1(ix, iy, iz);
+    float masking = fwdMaskingSmithGGXCorrelated(aSqr, cosI, cosO);
+
+    p.out.store(ix, iy, iz, masking);
+}
+
+// Backward Smith masking kernel: reads the upstream gradient from p.out and stores the
+// gradients for alphaSqr, cosThetaI and cosThetaO.
+__global__ void maskingSmithBwdKernel(MaskingSmithParams p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    float aSqr = p.alphaSqr.fetch1(ix, iy, iz);
+    float cosI = p.cosThetaI.fetch1(ix, iy, iz);
+    float cosO = p.cosThetaO.fetch1(ix, iy, iz);
+    float gradOut = p.out.fetch1(ix, iy, iz);
+
+    float gradASqr = 0.0f;
+    float gradCosI = 0.0f;
+    float gradCosO = 0.0f;
+    bwdMaskingSmithGGXCorrelated(aSqr, cosI, cosO, gradASqr, gradCosI, gradCosO, gradOut);
+
+    p.alphaSqr.store_grad(ix, iy, iz, gradASqr);
+    p.cosThetaI.store_grad(ix, iy, iz, gradCosI);
+    p.cosThetaO.store_grad(ix, iy, iz, gradCosO);
+}
+
+// Forward PBR specular kernel: evaluates fwdPbrSpecular per element into p.out.
+__global__ void pbrSpecularFwdKernel(PbrSpecular p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f colour = p.col.fetch3(ix, iy, iz);
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f outgoing = p.wo.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+    float alphaVal = p.alpha.fetch1(ix, iy, iz);
+
+    vec3f specular = fwdPbrSpecular(colour, normal, outgoing, incoming, alphaVal, p.min_roughness);
+
+    p.out.store(ix, iy, iz, specular);
+}
+
+// Backward PBR specular kernel: reads the upstream gradient from p.out and stores the
+// gradients for col, nrm, wo, wi and alpha.
+__global__ void pbrSpecularBwdKernel(PbrSpecular p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f colour = p.col.fetch3(ix, iy, iz);
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f outgoing = p.wo.fetch3(ix, iy, iz);
+    vec3f incoming = p.wi.fetch3(ix, iy, iz);
+    float alphaVal = p.alpha.fetch1(ix, iy, iz);
+    vec3f gradOut = p.out.fetch3(ix, iy, iz);
+
+    vec3f gradCol(0);
+    vec3f gradNrm(0);
+    vec3f gradWo(0);
+    vec3f gradWi(0);
+    float gradAlpha = 0.0f;
+    bwdPbrSpecular(colour, normal, outgoing, incoming, alphaVal, p.min_roughness, gradCol, gradNrm, gradWo, gradWi, gradAlpha, gradOut);
+
+    p.col.store_grad(ix, iy, iz, gradCol);
+    p.nrm.store_grad(ix, iy, iz, gradNrm);
+    p.wo.store_grad(ix, iy, iz, gradWo);
+    p.wi.store_grad(ix, iy, iz, gradWi);
+    p.alpha.store_grad(ix, iy, iz, gradAlpha);
+}
+
+// Forward PBR BSDF kernel: evaluates fwdPbrBSDF per element into p.out.
+__global__ void pbrBSDFFwdKernel(PbrBSDF p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f albedo = p.kd.fetch3(ix, iy, iz);
+    vec3f armVal = p.arm.fetch3(ix, iy, iz);
+    vec3f position = p.pos.fetch3(ix, iy, iz);
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f viewPos = p.view_pos.fetch3(ix, iy, iz);
+    vec3f lightPos = p.light_pos.fetch3(ix, iy, iz);
+
+    vec3f shaded = fwdPbrBSDF(albedo, armVal, position, normal, viewPos, lightPos, p.min_roughness, p.BSDF);
+
+    p.out.store(ix, iy, iz, shaded);
+}
+// Backward PBR BSDF kernel: reads the upstream gradient from p.out and stores the gradients
+// for kd, arm, pos, nrm, view_pos and light_pos.
+__global__ void pbrBSDFBwdKernel(PbrBSDF p)
+{
+    // Element handled by this thread: x/y from block and thread indices, z from the block index.
+    unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int iz = blockIdx.z;
+
+    // Threads outside p.gridSize have nothing to do.
+    bool inside = ix < p.gridSize.x && iy < p.gridSize.y && iz < p.gridSize.z;
+    if (!inside)
+        return;
+
+    vec3f albedo = p.kd.fetch3(ix, iy, iz);
+    vec3f armVal = p.arm.fetch3(ix, iy, iz);
+    vec3f position = p.pos.fetch3(ix, iy, iz);
+    vec3f normal = p.nrm.fetch3(ix, iy, iz);
+    vec3f viewPos = p.view_pos.fetch3(ix, iy, iz);
+    vec3f lightPos = p.light_pos.fetch3(ix, iy, iz);
+    vec3f gradOut = p.out.fetch3(ix, iy, iz);
+
+    vec3f gradKd(0);
+    vec3f gradArm(0);
+    vec3f gradPos(0);
+    vec3f gradNrm(0);
+    vec3f gradViewPos(0);
+    vec3f gradLightPos(0);
+    bwdPbrBSDF(albedo, armVal, position, normal, viewPos, lightPos, p.min_roughness, p.BSDF, gradKd, gradArm, gradPos, gradNrm, gradViewPos, gradLightPos, gradOut);
+
+    p.kd.store_grad(ix, iy, iz, gradKd);
+    p.arm.store_grad(ix, iy, iz, gradArm);
+    p.pos.store_grad(ix, iy, iz, gradPos);
+    p.nrm.store_grad(ix, iy, iz, gradNrm);
+    p.view_pos.store_grad(ix, iy, iz, gradViewPos);
+    p.light_pos.store_grad(ix, iy, iz, gradLightPos);
+}
--- a/render/renderutils/c_src/bsdf.h
+++ b/render/renderutils/c_src/bsdf.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Parameter block passed by value to the Lambert kernels (kernel bodies live
+// outside this header). Field roles are inferred from their names -- presumably
+// nrm = surface normal, wi = light direction, out = result / incoming gradient;
+// TODO confirm against the kernels. gridSize is the logical launch extent.
+struct LambertKernelParams
+{
+    Tensor  nrm;
+    Tensor  wi;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the Frostbite diffuse kernels (defined outside this
+// header). Presumably nrm = normal, wi / wo = light / view directions and
+// linearRoughness = per-element roughness -- TODO confirm against the kernels.
+// gridSize is the logical launch extent.
+struct FrostbiteDiffuseKernelParams
+{
+    Tensor  nrm;
+    Tensor  wi;
+    Tensor  wo;
+    Tensor  linearRoughness;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the Fresnel-Schlick kernels (defined outside this
+// header; the "Shlick" spelling is part of the public name). Presumably f0 / f90
+// are the reflectances at normal / grazing incidence and cosTheta the cosine
+// they are blended by -- TODO confirm against the kernels.
+struct FresnelShlickKernelParams
+{
+    Tensor  f0;
+    Tensor  f90;
+    Tensor  cosTheta;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the GGX normal-distribution kernels (defined outside
+// this header). alphaSqr and cosTheta are the two inputs; out holds the result
+// (and, presumably, the incoming gradient in the backward pass -- TODO confirm).
+struct NdfGGXParams
+{
+    Tensor  alphaSqr;
+    Tensor  cosTheta;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the Smith masking kernels (defined outside this header).
+// Inputs are alphaSqr plus the two cosines cosThetaI / cosThetaO (presumably
+// incoming / outgoing directions -- TODO confirm); gridSize is the launch extent.
+struct MaskingSmithParams
+{
+    Tensor  alphaSqr;
+    Tensor  cosThetaI;
+    Tensor  cosThetaO;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the PBR specular kernels (defined outside this header).
+// Tensor inputs are col, nrm, wo, wi and alpha; out is the result tensor.
+// min_roughness is a launch-wide scalar (presumably a lower clamp on roughness
+// -- TODO confirm against the kernels). gridSize is the logical launch extent.
+struct PbrSpecular
+{
+    Tensor  col;
+    Tensor  nrm;
+    Tensor  wo;
+    Tensor  wi;
+    Tensor  alpha;
+    Tensor  out;
+    dim3    gridSize;
+    float   min_roughness;
+};
+
+// Parameter block for pbrBSDFFwdKernel / pbrBSDFBwdKernel.
+// The kernels fetch kd, arm, pos, nrm, view_pos and light_pos as 3-vectors per
+// element; `out` is written by the forward kernel and read as the incoming
+// gradient by the backward kernel. gridSize bounds the (x, y, z) element range.
+// min_roughness and BSDF are forwarded unchanged to fwdPbrBSDF / bwdPbrBSDF
+// (BSDF is an integer selector whose values are defined by those functions).
+struct PbrBSDF
+{
+    Tensor  kd;
+    Tensor  arm;
+    Tensor  pos;
+    Tensor  nrm;
+    Tensor  view_pos;
+    Tensor  light_pos;
+    Tensor  out;
+    dim3    gridSize;
+    float   min_roughness;
+    int     BSDF;
+};
--- a/render/renderutils/c_src/common.cpp
+++ b/render/renderutils/c_src/common.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda_runtime.h>
+#include <algorithm>
+
+//------------------------------------------------------------------------
+// Block and grid size calculators for kernel launches.
+
+// Picks a 2D thread-block shape for a buffer of extent `dims`, starting from
+// maxWidth x maxHeight and reshaping it when the buffer is smaller than the
+// block in either direction. The z extent of the returned block is always 1.
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims)
+{
+    const int threadBudget = maxWidth * maxHeight;
+
+    // Degenerate case: at most one thread allowed, or at most one element to cover.
+    if (threadBudget <= 1 || (dims.x * dims.y) <= 1)
+        return dim3(1, 1, 1);
+
+    // Begin with the largest permitted block.
+    int blockW = maxWidth;
+    int blockH = maxHeight;
+
+    if (dims.x < blockW)
+    {
+        // Narrow buffer: keep halving the width while half of it still covers dims.x.
+        for (; (blockW >> 1) >= dims.x; blockW >>= 1)
+        {
+        }
+
+        // Spend the freed threads on height, but never exceed the buffer height.
+        blockH = threadBudget / blockW;
+        if (blockH > dims.y)
+            blockH = dims.y;
+    }
+    else if (dims.y < blockH)
+    {
+        // Short buffer: halve the height (widening the block while it is still
+        // narrower than the buffer) until the block fits vertically. The branch
+        // condition guarantees at least one iteration.
+        do
+        {
+            blockH >>= 1;
+            if (blockW < dims.x)
+                blockW <<= 1;
+        } while (blockH > dims.y);
+    }
+
+    return dim3(blockW, blockH, 1);
+}
+
+// Returns the sub-block shape that can be reduced with horizontal SIMD
+// operations (e.g. __shfl_xor_sync): at most 32 threads in total, filled
+// along x first, then y, then z, and never larger than blockSize itself.
+dim3 getWarpSize(dim3 blockSize)
+{
+    const unsigned int lanes = 32u;
+
+    // x takes as many lanes as the block is wide, up to a full warp.
+    const unsigned int wx = std::min(blockSize.x, lanes);
+
+    // y gets the lanes left over after x (at least one), capped by the block height.
+    const unsigned int wy = std::min(std::max(lanes / blockSize.x, 1u), std::min(lanes, blockSize.y));
+
+    // z gets whatever is left after x and y (at least one), capped by the block depth.
+    const unsigned int wz = std::min(std::max(lanes / (blockSize.x * blockSize.y), 1u), std::min(lanes, blockSize.z));
+
+    return dim3(wx, wy, wz);
+}
+
+// Number of blocks of shape blockSize needed to cover dims in each dimension
+// (ceiling division written as (n - 1) / b + 1).
+// NOTE(review): the subtraction is done on the unsigned dim3 members, so a
+// zero extent in dims wraps around instead of yielding zero blocks -- callers
+// are presumably expected to pass non-empty extents; confirm.
+dim3 getLaunchGridSize(dim3 blockSize, dim3 dims)
+{
+    dim3 blocks;
+    blocks.x = 1 + (dims.x - 1) / blockSize.x;
+    blocks.y = 1 + (dims.y - 1) / blockSize.y;
+    blocks.z = 1 + (dims.z - 1) / blockSize.z;
+    return blocks;
+}
+
+//------------------------------------------------------------------------
--- a/render/renderutils/c_src/common.h
+++ b/render/renderutils/c_src/common.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+#include <cuda.h>
+#include <stdint.h>
+
+#include "vec3f.h"
+#include "vec4f.h"
+#include "tensor.h"
+
+// Host-side launch helpers, implemented in common.cpp.
+// getLaunchBlockSize chooses a 2D block shape no larger than maxWidth x maxHeight for a buffer of extent dims.
+// getLaunchGridSize returns how many such blocks are needed to cover dims.
+dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims);
+dim3 getLaunchGridSize(dim3 blockSize, dim3 dims);
+
+#ifdef __CUDACC__
+
+// Provide M_PI for MSVC builds of CUDA code. Note that it is defined here as a
+// single-precision literal.
+#ifdef _MSC_VER
+#define M_PI 3.14159265358979323846f
+#endif
+
+// Inline variant for CUDA translation units: the sub-block shape (at most 32
+// threads, filled along x, then y, then z) that a single warp covers inside a
+// block of shape blockSize.
+__host__ __device__ static inline dim3 getWarpSize(dim3 blockSize)
+{
+    // x takes up to a full warp.
+    unsigned int wx = min(blockSize.x, 32u);
+
+    // y gets the lanes left after x (at least one), capped by the block height.
+    unsigned int wy = min(max(32u / blockSize.x, 1u), min(32u, blockSize.y));
+
+    // z gets the lanes left after x and y (at least one), capped by the block depth.
+    unsigned int wz = min(max(32u / (blockSize.x * blockSize.y), 1u), min(32u, blockSize.z));
+
+    return dim3(wx, wy, wz);
+}
+
+// Restricts val to the interval [mn, mx]: the lower bound is applied first, then the upper bound.
+__device__ static inline float clamp(float val, float mn, float mx)
+{
+    float raised = max(val, mn);
+    return min(raised, mx);
+}
+#else
+// Translation units not compiled as CUDA only see this declaration; the definition is in common.cpp.
+dim3 getWarpSize(dim3 blockSize);
+#endif
--- a/render/renderutils/c_src/cubemap.cu
+++ b/render/renderutils/c_src/cubemap.cu
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include "common.h"
+#include "cubemap.h"
+#include <float.h>
+
+// Weight of texel (x, y) on an N x N cube face, computed as the product of the
+// angular spans atan((d + 1) / H) - atan(d / H) along x and y, where H = N / 2
+// and d is the texel's index distance from H. Faces with N <= 1 get weight 1.
+// Reference: https://cgvr.cs.uni-bremen.de/teaching/cg_literatur/Spherical,%20Cubic,%20and%20Parabolic%20Environment%20Mappings.pdf
+// NOTE(review): distances are measured from texel index H, not from the face
+// centre, so texels H - 1 and H (which mirror each other) get different spans
+// -- confirm this asymmetry is intended.
+__device__ float pixel_area(int x, int y, int N)
+{
+    if (!(N > 1))
+        return 1;
+
+    int half = N / 2;
+    int distX = abs(x - half);
+    int distY = abs(y - half);
+
+    float spanX = atan((float)(distX + 1) / (float)half) - atan((float)distX / (float)half);
+    float spanY = atan((float)(distY + 1) / (float)half) - atan((float)distY / (float)half);
+    return spanX * spanY;
+}
+
+// Converts texel (x, y) on face `side` (0..5) of an N x N cubemap into a
+// normalized direction through the texel centre. Any other `side` value
+// yields the zero vector.
+__device__ vec3f cube_to_dir(int x, int y, int side, int N)
+{
+    // Texel centre mapped to [-1, 1] on the face.
+    float u = 2.0f * (((float)x + 0.5f) / (float)N) - 1.0f;
+    float v = 2.0f * (((float)y + 0.5f) / (float)N) - 1.0f;
+
+    if (side == 0) return safeNormalize(vec3f(1, -v, -u));
+    if (side == 1) return safeNormalize(vec3f(-1, -v, u));
+    if (side == 2) return safeNormalize(vec3f(u, 1, v));
+    if (side == 3) return safeNormalize(vec3f(u, -1, -v));
+    if (side == 4) return safeNormalize(vec3f(u, -v, 1));
+    if (side == 5) return safeNormalize(vec3f(-u, -v, -1));
+
+    return vec3f(0,0,0); // Not a valid face index
+}
+
+// Re-expresses direction v in the local frame of cube face `side` (0..5) by
+// permuting and negating its components; the caller in this file treats the
+// result as coordinates in which the face lies at z = 1. Any other `side`
+// value yields the zero vector.
+__device__ vec3f dir_to_side(int side, vec3f v)
+{
+    if (side == 0) return vec3f(-v.z, -v.y,  v.x);
+    if (side == 1) return vec3f( v.z, -v.y, -v.x);
+    if (side == 2) return vec3f( v.x,  v.z,  v.y);
+    if (side == 3) return vec3f( v.x, -v.z, -v.y);
+    if (side == 4) return vec3f( v.x, -v.y,  v.z);
+    if (side == 5) return vec3f(-v.x, -v.y, -v.z);
+
+    return vec3f(0,0,0); // Not a valid face index
+}
+
+// 1D extent helper: offsets the 2D vector (x, z) to either side by
+// tan(theta) * |(x, z)| along its perpendicular and projects both offset
+// vectors onto the plane z = 1, returning the two projected coordinates in
+// _min / _max. When an offset vector's z is (almost) non-positive the
+// projection is undefined and the bound saturates to +/-FLT_MAX according to
+// the sign of its x component.
+__device__ void extents_1d(float x, float z, float theta, float& _min, float& _max)
+{
+    float len = sqrtf(x * x + z * z);
+
+    // Offset vector on the "left" side and its projection.
+    float leftX = x - z * tan(theta) * len;
+    float leftZ = z + x * tan(theta) * len;
+    if (leftZ > 0.00001f)
+        _min = leftX / leftZ;
+    else if (leftX > 0.0f)
+        _min = FLT_MAX;
+    else
+        _min = -FLT_MAX;
+
+    // Offset vector on the "right" side and its projection.
+    float rightX = x + z * tan(theta) * len;
+    float rightZ = z - x * tan(theta) * len;
+    if (rightZ > 0.00001f)
+        _max = rightX / rightZ;
+    else if (rightX > 0.0f)
+        _max = FLT_MAX;
+    else
+        _max = -FLT_MAX;
+}
+
+// Computes a texel-space bounding box on face `side` of an N x N cubemap for
+// the cone of half-angle theta around direction v. For theta >= PI/4 the
+// whole face is returned. If the projected extents miss the face entirely,
+// all four outputs are set to -1 to flag an invalid box.
+__device__ void dir_extents(int side, int N, vec3f v, float theta, int &_xmin, int& _xmax, int& _ymin, int& _ymax)
+{
+    vec3f local = dir_to_side(side, v); // face-local frame, face at z = 1
+
+    // Wide cones: do not bother, cover the full face.
+    if (!(theta < 0.785398f)) // PI/4
+    {
+        _xmin = 0.0f;
+        _xmax = (float)(N-1);
+        _ymin = 0.0f;
+        _ymax = (float)(N-1);
+        return;
+    }
+
+    // Extents along the two face axes, in [-1, 1] face coordinates.
+    float loX, hiX, loY, hiY;
+    extents_1d(local.x, local.z, theta, loX, hiX);
+    extents_1d(local.y, local.z, theta, loY, hiY);
+
+    bool missesFace = loX > 1.0f || hiX < -1.0f || loY > 1.0f || hiY < -1.0f;
+    if (missesFace)
+    {
+        _xmin = -1; _xmax = -1; _ymin = -1; _ymax = -1; // Bad aabb
+        return;
+    }
+
+    // Map [-1, 1] to texel indices and clamp to the face.
+    _xmin = (int)min(max((loX + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
+    _xmax = (int)min(max((hiX + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
+    _ymin = (int)min(max((loY + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
+    _ymax = (int)min(max((hiY + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Diffuse kernel
+// Diffuse (cosine-weighted) cubemap filter, forward pass. Each thread owns one
+// output texel (px, py) on face pz and brute-force sums every texel of the
+// input cubemap, weighted by the clamped cosine to the output direction times
+// the texel's pixel_area, divided by pi.
+__global__ void DiffuseCubemapFwdKernel(DiffuseCubemapKernelParams p)
+{
+    // Output texel handled by this thread.
+    int px = threadIdx.x + blockDim.x * blockIdx.x;
+    int py = threadIdx.y + blockDim.y * blockIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    int faceRes = p.cubemap.dims[1];
+    vec3f outDir = cube_to_dir(px, py, pz, faceRes);
+
+    vec3f accum(0);
+
+    // Visit faces, rows and columns in this fixed order (it fixes the summation order).
+    for (int face = 0; face < p.cubemap.dims[0]; ++face)
+    {
+        for (int ty = 0; ty < faceRes; ++ty)
+        {
+            for (int tx = 0; tx < faceRes; ++tx)
+            {
+                vec3f inDir = cube_to_dir(tx, ty, face, faceRes);
+                float costheta = min(max(dot(outDir, inDir), 0.0f), 0.999f);
+                float weight = costheta * pixel_area(tx, ty, faceRes) / 3.141592f; // pi = area of positive hemisphere
+                accum += p.cubemap.fetch3(tx, ty, face) * weight;
+            }
+        }
+    }
+
+    p.out.store(px, py, pz, accum);
+}
+
+// Diffuse cubemap filter, backward pass. Each thread takes the gradient of one
+// output texel (read from p.out) and scatters it to every input texel with the
+// same weights as the forward pass. Many threads hit the same input texel, so
+// the accumulation into cubemap.d_val uses atomicAdd.
+__global__ void DiffuseCubemapBwdKernel(DiffuseCubemapKernelParams p)
+{
+    // Output texel handled by this thread.
+    int px = threadIdx.x + blockDim.x * blockIdx.x;
+    int py = threadIdx.y + blockDim.y * blockIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    int faceRes = p.cubemap.dims[1];
+    vec3f outDir = cube_to_dir(px, py, pz, faceRes);
+    vec3f gradOut = p.out.fetch3(px, py, pz);
+
+    for (int face = 0; face < p.cubemap.dims[0]; ++face)
+    {
+        for (int ty = 0; ty < faceRes; ++ty)
+        {
+            for (int tx = 0; tx < faceRes; ++tx)
+            {
+                vec3f inDir = cube_to_dir(tx, ty, face, faceRes);
+                float costheta = min(max(dot(outDir, inDir), 0.0f), 0.999f);
+                float weight = costheta * pixel_area(tx, ty, faceRes) / 3.141592f; // pi = area of positive hemisphere
+
+                // One atomic per colour channel of the input texel's gradient.
+                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 0), gradOut.x * weight);
+                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 1), gradOut.y * weight);
+                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 2), gradOut.z * weight);
+            }
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////
+// GGX splitsum kernel 
+
+// GGX normal distribution term: alphaSqr / (pi * d^2) with
+// d = (c * alphaSqr - c) * c + 1 and c = cosTheta clamped to [0, 1].
+// NOTE(review): the division is written against M_PI directly; its precision
+// therefore follows however M_PI is defined for the build -- keep the
+// expression intact if bit-exact results matter.
+__device__ inline float ndfGGX(const float alphaSqr, const float cosTheta)
+{
+    float c = clamp(cosTheta, 0.0, 1.0f);
+    float d = (c * alphaSqr - c) * c + 1.0f;
+    return alphaSqr / (d * d * M_PI);
+}
+
+// For every output texel (px, py) on face pz, computes per input face `s` the
+// texel bounding box of all directions whose dot product with the output
+// direction is at least costheta_cutoff. Results are written to p.out as four
+// consecutive channels per face: [min_x, max_x, min_y, max_y] at s * 4 + 0..3.
+// A face with no qualifying texel keeps the initial "empty" box
+// (min = size - 1, max = 0), which the specular kernels skip via xmin <= xmax.
+// NOTE(review): for a 1-texel-wide grid the empty box is min == max == 0 and
+// is indistinguishable from a hit -- confirm this case cannot occur.
+__global__ void SpecularBoundsKernel(SpecularBoundsKernelParams p)
+{
+    // Output texel handled by this thread.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    // The face resolution is taken from the launch grid width.
+    int Npx = p.gridSize.x;
+    vec3f VNR = cube_to_dir(px, py, pz, Npx);
+
+    const int TILE_SIZE = 16;
+
+    // Brute force entire cubemap and compute bounds for the cone
+    for (int s = 0; s < p.gridSize.z; ++s)
+    {
+        // Assume empty BBox (min > max)
+        int _min_x = p.gridSize.x - 1, _max_x = 0;
+        int _min_y = p.gridSize.y - 1, _max_y = 0;
+        
+        // For each (TILE_SIZE x TILE_SIZE) tile
+        for (int tx = 0; tx < (p.gridSize.x + TILE_SIZE - 1) / TILE_SIZE; tx++)
+        {
+            for (int ty = 0; ty < (p.gridSize.y + TILE_SIZE - 1) / TILE_SIZE; ty++)
+            {
+                // Compute tile extents (end is exclusive and clipped to the grid)
+                int tsx = tx * TILE_SIZE, tsy = ty * TILE_SIZE;
+                int tex = min((tx + 1) * TILE_SIZE, p.gridSize.x), tey = min((ty + 1) * TILE_SIZE, p.gridSize.y);
+
+                // Use some blunt interval arithmetics to cull tiles:
+                // take the directions at the four tile corners ...
+                vec3f L0 = cube_to_dir(tsx, tsy, s, Npx), L1 = cube_to_dir(tex, tsy, s, Npx);
+                vec3f L2 = cube_to_dir(tsx, tey, s, Npx), L3 = cube_to_dir(tex, tey, s, Npx);
+                
+                // ... build a per-component interval from them ...
+                float minx = min(min(L0.x, L1.x), min(L2.x, L3.x)), maxx = max(max(L0.x, L1.x), max(L2.x, L3.x));
+                float miny = min(min(L0.y, L1.y), min(L2.y, L3.y)), maxy = max(max(L0.y, L1.y), max(L2.y, L3.y));
+                float minz = min(min(L0.z, L1.z), min(L2.z, L3.z)), maxz = max(max(L0.z, L1.z), max(L2.z, L3.z));
+
+                // ... and bound the dot product with VNR from above over that interval.
+                float maxdp = max(minx * VNR.x, maxx * VNR.x) + max(miny * VNR.y, maxy * VNR.y) + max(minz * VNR.z, maxz * VNR.z);
+                if (maxdp >= p.costheta_cutoff)
+                {
+                    // Test all pixels in tile.
+                    for (int y = tsy; y < tey; ++y)
+                    {
+                        for (int x = tsx; x < tex; ++x)
+                        {
+                            vec3f L = cube_to_dir(x, y, s, Npx);
+                            if (dot(L, VNR) >= p.costheta_cutoff)
+                            {
+                                // Grow the bounding box to include this texel.
+                                _min_x = min(_min_x, x);
+                                _max_x = max(_max_x, x);
+                                _min_y = min(_min_y, y);
+                                _max_y = max(_max_y, y);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        // Four channels per input face: min_x, max_x, min_y, max_y.
+        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 0), _min_x);
+        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 1), _max_x);
+        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 2), _min_y);
+        p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 3), _max_y);
+    }
+}
+
+// GGX-weighted specular cubemap filter, forward pass. Each thread owns one
+// output texel (px, py) on face pz. For every input face it reads the
+// precomputed texel bounds from p.bounds, visits the texels inside them whose
+// direction passes the costheta_cutoff test, and accumulates colour and weight.
+// The output has four channels: weighted colour sum (xyz) and weight sum (w).
+__global__ void SpecularCubemapFwdKernel(SpecularCubemapKernelParams p)
+{
+    // Output texel handled by this thread.
+    int px = threadIdx.x + blockDim.x * blockIdx.x;
+    int py = threadIdx.y + blockDim.y * blockIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    int faceRes = p.cubemap.dims[1];
+    vec3f VNR = cube_to_dir(px, py, pz, faceRes);
+
+    // alpha = roughness^2, and the NDF takes alpha^2.
+    float alpha = p.roughness * p.roughness;
+    float alphaSqr = alpha * alpha;
+
+    float weightTotal = 0.0f;
+    vec3f accum(0);
+    for (int face = 0; face < p.cubemap.dims[0]; ++face)
+    {
+        // Bounds are stored as [xmin, xmax, ymin, ymax] per face.
+        int x0 = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 0));
+        int x1 = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 1));
+        int y0 = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 2));
+        int y1 = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 3));
+
+        // An inverted x range marks a face with no contributing texels.
+        if (x0 > x1)
+            continue;
+
+        for (int ty = y0; ty <= y1; ++ty)
+        {
+            for (int tx = x0; tx <= x1; ++tx)
+            {
+                vec3f L = cube_to_dir(tx, ty, face, faceRes);
+                float LdotVNR = dot(L, VNR);
+
+                // Skip texels outside the cone (written so NaN is rejected too).
+                if (!(LdotVNR >= p.costheta_cutoff))
+                    continue;
+
+                vec3f H = safeNormalize(L + VNR);
+
+                float wiDotN = max(LdotVNR, 0.0f);
+                float VNRDotH = max(dot(VNR, H), 0.0f);
+
+                float weight = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(tx, ty, faceRes) / 4.0f;
+                accum += p.cubemap.fetch3(tx, ty, face) * weight;
+                weightTotal += weight;
+            }
+        }
+    }
+
+    p.out.store(p.out._nhwcIndex(pz, py, px, 0), accum.x);
+    p.out.store(p.out._nhwcIndex(pz, py, px, 1), accum.y);
+    p.out.store(p.out._nhwcIndex(pz, py, px, 2), accum.z);
+    p.out.store(p.out._nhwcIndex(pz, py, px, 3), weightTotal);
+}
+
+// GGX-weighted specular cubemap filter, backward pass. Each thread takes the
+// colour gradient of one output texel (the xyz part read from p.out) and
+// scatters it to the input texels that contributed in the forward pass, using
+// the same bounds, cutoff test and weights. Many threads hit the same input
+// texel, so accumulation into cubemap.d_val uses atomicAdd.
+// No gradient is propagated for the weight-sum channel in this kernel.
+__global__ void SpecularCubemapBwdKernel(SpecularCubemapKernelParams p)
+{
+    // Calculate pixel position.
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    int Npx = p.cubemap.dims[1];
+    vec3f VNR = cube_to_dir(px, py, pz, Npx);
+
+    // Incoming gradient for this output texel.
+    vec3f grad = p.out.fetch3(px, py, pz);
+
+    // alpha = roughness^2, and the NDF takes alpha^2.
+    float alpha = p.roughness * p.roughness;
+    float alphaSqr = alpha * alpha;
+
+    for (int s = 0; s < p.cubemap.dims[0]; ++s)
+    {
+        // Bounds are stored as [xmin, xmax, ymin, ymax] per face.
+        int xmin, xmax, ymin, ymax;
+        xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
+        xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
+        ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
+        ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));
+
+        // An inverted x range marks a face with no contributing texels.
+        if (xmin <= xmax)
+        {
+            for (int y = ymin; y <= ymax; ++y)
+            {
+                for (int x = xmin; x <= xmax; ++x)
+                {
+                    vec3f L = cube_to_dir(x, y, s, Npx);
+                    if (dot(L, VNR) >= p.costheta_cutoff)
+                    {
+                        vec3f H = safeNormalize(L + VNR);
+
+                        float wiDotN = max(dot(L, VNR), 0.0f);
+                        float VNRDotH = max(dot(VNR, H), 0.0f);
+
+                        // Same weight as in the forward kernel.
+                        float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;
+
+                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
+                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
+                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/render/renderutils/c_src/cubemap.h
+++ b/render/renderutils/c_src/cubemap.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Parameter block for the diffuse cubemap kernels.
+// cubemap: input cubemap; the kernels read dims[0] as the face count and
+//          dims[1] as the face resolution, and accumulate gradients in d_val.
+// out:     filtered result (forward) / incoming gradient (backward).
+// gridSize: launch extent; x, y index texels and z the face.
+struct DiffuseCubemapKernelParams
+{
+    Tensor  cubemap;
+    Tensor  out;
+    dim3    gridSize;
+};
+
+// Parameter block for the specular cubemap kernels.
+// cubemap: input cubemap (dims[0] faces of resolution dims[1]).
+// bounds:  per output texel and input face, four channels
+//          [xmin, xmax, ymin, ymax] as produced by SpecularBoundsKernel.
+// out:     forward: 4 channels (weighted colour sum, weight sum);
+//          backward: incoming gradient, of which the kernel reads xyz.
+// costheta_cutoff: texels whose cosine to the output direction is below this are skipped.
+// roughness: squared to get alpha, which is squared again for the GGX term.
+struct SpecularCubemapKernelParams
+{
+    Tensor  cubemap;
+    Tensor  bounds;
+    Tensor  out;
+    dim3    gridSize;
+    float   costheta_cutoff;
+    float   roughness;
+};
+
+// Parameter block for SpecularBoundsKernel.
+// costheta_cutoff: minimum cosine for a texel to be included in the bounds.
+// out:      receives four channels per input face: [min_x, max_x, min_y, max_y].
+// gridSize: x is also used as the face resolution, z as the face count.
+struct SpecularBoundsKernelParams
+{
+    float   costheta_cutoff;
+    Tensor  out;
+    dim3    gridSize;
+};
--- a/render/renderutils/c_src/loss.cu
+++ b/render/renderutils/c_src/loss.cu
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda.h>
+
+#include "common.h"
+#include "loss.h"
+
+//------------------------------------------------------------------------
+// Utils
+
+// Derivative of |x| with respect to x: -1 for negative x, 0 at exactly zero, +1 otherwise.
+__device__ inline float bwdAbs(float x)
+{
+    if (x == 0.0f)
+        return 0.0f;
+    if (x < 0.0f)
+        return -1.0f;
+    return 1.0f;
+}
+
+// Butterfly reduction over a full 32-lane warp using __shfl_xor_sync with the
+// all-lanes mask; every lane ends up with the sum of `val` over the warp.
+// All 32 lanes must call this together.
+__device__ float warpSum(float val) {
+    int laneMask = 1;
+    while (laneMask < 32)
+    {
+        val += __shfl_xor_sync(0xFFFFFFFF, val, laneMask);
+        laneMask *= 2;
+    }
+    return val;
+}
+
+//------------------------------------------------------------------------
+// Tonemapping
+
+// Linear -> sRGB transfer curve: power segment above 0.0031308, linear segment
+// (slope 12.92) below it, with negative inputs clamped to zero.
+__device__ inline float fwdSRGB(float x)
+{
+    if (x > 0.0031308f)
+        return powf(max(x, 0.0031308f), 1.0f / 2.4f) * 1.055f - 0.055f;
+
+    return 12.92f * max(x, 0.0f);
+}
+
+// Backward of fwdSRGB: adds d_out * d(fwdSRGB)/dx to d_x. Nothing is added for
+// x <= 0 (the clamped region) or for NaN.
+__device__ inline void bwdSRGB(float x, float &d_x, float d_out)
+{
+    // Outside the positive range the forward curve is flat.
+    if (!(x > 0.0f))
+        return;
+
+    if (x > 0.0031308f)
+    {
+        // Power segment: 1.055 / 2.4 * x^(1/2.4 - 1).
+        d_x += d_out * 0.439583f / powf(x, 0.583333f);
+    }
+    else
+    {
+        // Linear segment.
+        d_x += d_out * 12.92f;
+    }
+}
+
+// Per-channel tonemapper: log(x + 1) followed by the sRGB transfer curve.
+__device__ inline vec3f fwdTonemapLogSRGB(vec3f x)
+{
+    float r = fwdSRGB(logf(x.x + 1.0f));
+    float g = fwdSRGB(logf(x.y + 1.0f));
+    float b = fwdSRGB(logf(x.z + 1.0f));
+    return vec3f(r, g, b);
+}
+
+// Single-channel backward of the log + sRGB tonemapper. Only inputs strictly
+// inside (0, 65535) receive a gradient. The chain-rule factor 1 / (x + 1) for
+// the log is applied to the whole accumulator, as the callers pass it in zeroed.
+__device__ inline void bwdTonemapLogSRGBChannel(float x, float& d_x, float d_out)
+{
+    bool inRange = x > 0.0f && x < 65535.0f;
+    if (!inRange)
+        return;
+
+    bwdSRGB(logf(x + 1.0f), d_x, d_out);
+    d_x *= 1 / (x + 1.0f);
+}
+
+// Backward of fwdTonemapLogSRGB, applied independently to each channel.
+__device__ inline void bwdTonemapLogSRGB(vec3f x, vec3f& d_x, vec3f d_out)
+{
+    bwdTonemapLogSRGBChannel(x.x, d_x.x, d_out.x);
+    bwdTonemapLogSRGBChannel(x.y, d_x.y, d_out.y);
+    bwdTonemapLogSRGBChannel(x.z, d_x.z, d_out.z);
+}
+
+// Relative MSE: (img - target)^2 / (img^2 + target^2 + eps).
+__device__ inline float fwdRELMSE(float img, float target, float eps = 0.1f)
+{
+    float diff = img - target;
+    float scale = img * img + target * target + eps;
+    return diff * diff / scale;
+}
+
+// Backward of fwdRELMSE: accumulates d_out * dLoss/dimg into d_img and
+// d_out * dLoss/dtarget into d_target.
+__device__ inline void bwdRELMSE(float img, float target, float &d_img, float &d_target, float d_out, float eps = 0.1f)
+{
+    float scale   = (target * target + img * img + eps);
+    float scaleSq = (scale * scale);
+    float diff    = (img - target);
+
+    d_img    += d_out * 2 * diff * (target * (target + img) + eps) / scaleSq;
+    d_target -= d_out * 2 * diff * (img * (target + img) + eps) / scaleSq;
+}
+
+// Symmetric mean absolute percentage error term: |img - target| / (img + target + eps).
+__device__ inline float fwdSMAPE(float img, float target, float eps=0.01f)
+{
+    float absDiff = abs(img - target);
+    float scale = img + target + eps;
+    return absDiff / scale;
+}
+
+// Backward of fwdSMAPE: accumulates d_out * dLoss/dimg into d_img and
+// d_out * dLoss/dtarget into d_target, using bwdAbs for the |.| derivative.
+__device__ inline void bwdSMAPE(float img, float target, float& d_img, float& d_target, float d_out, float eps = 0.01f)
+{
+    float scale   = (target + img + eps);
+    float scaleSq = (scale * scale);
+
+    d_img    += d_out * bwdAbs(img - target) * (2 * target + eps) / scaleSq;
+    d_target -= d_out * bwdAbs(img - target) * (2 * img + eps) / scaleSq;
+}
+
+//------------------------------------------------------------------------
+// Kernels
+
+// Image loss, forward pass. Each in-range thread computes the loss of one
+// pixel (inputs clamped to [0, 65535], optionally tonemapped, then averaged
+// over the three channels). The per-pixel values are summed with warpSum and
+// one thread per getWarpSize(blockDim) tile writes the partial sum to p.out.
+// Out-of-range threads do not return early: they contribute 0 so that every
+// lane takes part in the warp shuffle.
+__global__ void imgLossFwdKernel(LossKernelParams p)
+{
+    // Pixel handled by this thread.
+    unsigned int px = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int py = threadIdx.y + blockDim.y * blockIdx.y;
+    unsigned int pz = blockIdx.z;
+
+    bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
+
+    float pixelLoss = 0.0f;
+    if (inside)
+    {
+        vec3f img = p.img.fetch3(px, py, pz);
+        vec3f target = p.target.fetch3(px, py, pz);
+
+        // Clamp both images to the supported range before anything else.
+        img = vec3f(clamp(img.x, 0.0f, 65535.0f), clamp(img.y, 0.0f, 65535.0f), clamp(img.z, 0.0f, 65535.0f));
+        target = vec3f(clamp(target.x, 0.0f, 65535.0f), clamp(target.y, 0.0f, 65535.0f), clamp(target.z, 0.0f, 65535.0f));
+
+        if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+        {
+            img = fwdTonemapLogSRGB(img);
+            target = fwdTonemapLogSRGB(target);
+        }
+
+        // Per-channel loss; anything that is not MSE / RELMSE / SMAPE falls back to L1.
+        vec3f channelLoss(0);
+        switch (p.loss)
+        {
+        case LOSS_MSE:
+            channelLoss = (img - target) * (img - target);
+            break;
+        case LOSS_RELMSE:
+            channelLoss = vec3f(fwdRELMSE(img.x, target.x), fwdRELMSE(img.y, target.y), fwdRELMSE(img.z, target.z));
+            break;
+        case LOSS_SMAPE:
+            channelLoss = vec3f(fwdSMAPE(img.x, target.x), fwdSMAPE(img.y, target.y), fwdSMAPE(img.z, target.z));
+            break;
+        default:
+            channelLoss = vec3f(abs(img.x - target.x), abs(img.y - target.y), abs(img.z - target.z));
+            break;
+        }
+
+        pixelLoss = sum(channelLoss) / 3.0f;
+    }
+
+    // Reduce across the warp; all lanes reach this point.
+    pixelLoss = warpSum(pixelLoss);
+
+    // The first thread of each warp-sized tile writes the tile's partial sum.
+    dim3 tile = getWarpSize(blockDim);
+    bool tileLeader = threadIdx.x % tile.x == 0 && threadIdx.y % tile.y == 0 && threadIdx.z % tile.z == 0;
+    if (inside && tileLeader)
+        p.out.store(px / tile.x, py / tile.y, pz / tile.z, pixelLoss);
+}
+
+// Image loss, backward pass. Each in-range thread reads the gradient of the
+// warp-tile partial sum its pixel contributed to (same tile indexing as the
+// forward kernel), recomputes the tonemapped inputs, differentiates the
+// selected loss per channel, chains through the tonemapper, and stores the
+// gradients for both img and target. Channels whose raw value lies outside
+// (0, 65535) get a zero gradient, matching the clamp in the forward pass.
+__global__ void imgLossBwdKernel(LossKernelParams p)
+{ 
+    // Calculate pixel position.
+    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int pz = blockIdx.z;
+
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    dim3 warpSize = getWarpSize(blockDim);
+
+    vec3f _img = p.img.fetch3(px, py, pz);
+    vec3f _target = p.target.fetch3(px, py, pz);
+    // Gradient of the partial sum this pixel was reduced into.
+    float d_out = p.out.fetch1(px / warpSize.x, py / warpSize.y, pz / warpSize.z);
+
+    /////////////////////////////////////////////////////////////////////
+    // FWD
+
+    // The clamp of the forward pass is not replayed here; out-of-range
+    // channels are handled by zeroing their gradient at the end.
+    vec3f img = _img, target = _target;
+    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+    {
+        img = fwdTonemapLogSRGB(img);
+        target = fwdTonemapLogSRGB(target);
+    }
+
+    /////////////////////////////////////////////////////////////////////
+    // BWD
+
+    // The forward pass averages the three channels.
+    vec3f d_vloss = vec3f(d_out, d_out, d_out) / 3.0f;
+
+    vec3f d_img(0), d_target(0);
+    if (p.loss == LOSS_MSE)
+    {
+        // Each channel uses its own upstream gradient component (the z channel
+        // previously reused d_vloss.x).
+        d_img = vec3f(d_vloss.x * 2 * (img.x - target.x), d_vloss.y * 2 * (img.y - target.y), d_vloss.z * 2 * (img.z - target.z));
+        d_target = -d_img;
+    }
+    else if (p.loss == LOSS_RELMSE)
+    {
+        bwdRELMSE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
+        bwdRELMSE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
+        bwdRELMSE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
+    }
+    else if (p.loss == LOSS_SMAPE)
+    {
+        bwdSMAPE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
+        bwdSMAPE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
+        bwdSMAPE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
+    }
+    else
+    {
+        // L1 (default).
+        d_img = d_vloss * vec3f(bwdAbs(img.x - target.x), bwdAbs(img.y - target.y), bwdAbs(img.z - target.z));
+        d_target = -d_img;
+    }
+
+
+    // Chain through the tonemapper, evaluated at the raw (pre-tonemap) values.
+    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+    {
+        vec3f d__img(0), d__target(0);
+        bwdTonemapLogSRGB(_img, d__img, d_img);
+        bwdTonemapLogSRGB(_target, d__target, d_target);
+        d_img = d__img; d_target = d__target;
+    }
+
+    // Backward of the forward clamp: no gradient outside (0, 65535).
+    if (_img.x <= 0.0f || _img.x >= 65535.0f) d_img.x = 0;
+    if (_img.y <= 0.0f || _img.y >= 65535.0f) d_img.y = 0;
+    if (_img.z <= 0.0f || _img.z >= 65535.0f) d_img.z = 0;
+    if (_target.x <= 0.0f || _target.x >= 65535.0f) d_target.x = 0;
+    if (_target.y <= 0.0f || _target.y >= 65535.0f) d_target.y = 0;
+    if (_target.z <= 0.0f || _target.z >= 65535.0f) d_target.z = 0;
+
+    p.img.store_grad(px, py, pz, d_img);
+    p.target.store_grad(px, py, pz, d_target);
+}
--- a/render/renderutils/c_src/loss.h
+++ b/render/renderutils/c_src/loss.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Tonemapper applied to both images before the loss is evaluated.
+// TONEMAPPER_LOG_SRGB selects log(x + 1) followed by the sRGB curve;
+// TONEMAPPER_NONE leaves the (clamped) values unchanged.
+enum TonemapperType
+{
+    TONEMAPPER_NONE = 0,
+    TONEMAPPER_LOG_SRGB = 1
+};
+
+// Per-channel loss selected in the image loss kernels. The kernels test for
+// MSE, RELMSE and SMAPE explicitly and treat every other value as L1.
+enum LossType
+{
+    LOSS_L1 = 0,
+    LOSS_MSE = 1,
+    LOSS_RELMSE = 2,
+    LOSS_SMAPE = 3
+};
+
+// Parameter block for imgLossFwdKernel / imgLossBwdKernel.
+// img, target: the two images, fetched as 3-vectors per pixel; both receive gradients.
+// out:         forward: one partial loss sum per warp-sized tile;
+//              backward: the gradient of those partial sums.
+// gridSize:    pixel extent of the launch.
+// tonemapper, loss: select the tonemapper and the loss function.
+struct LossKernelParams
+{
+    Tensor          img;
+    Tensor          target;
+    Tensor          out;
+    dim3            gridSize;
+    TonemapperType  tonemapper;
+    LossType        loss;
+};
--- a/render/renderutils/c_src/mesh.cu
+++ b/render/renderutils/c_src/mesh.cu
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda.h>
+#include <stdio.h>
+
+#include "common.h"
+#include "mesh.h"
+
+
+//------------------------------------------------------------------------
+// Kernels
+
+// Transforms points (isPoints: implicit w = 1, four output components) or
+// vectors (no translation, three output components) by a 4x4 matrix.
+// The first 16 threads along x of each block load the matrix for index pz into
+// shared memory, stored transposed so that mtx[c][r] holds matrix element (r, c).
+// NOTE(review): this assumes blockDim.x >= 16 and a single pz per block --
+// confirm against the launch code.
+__global__ void xfmPointsFwdKernel(XfmKernelParams p)
+{
+    unsigned int px = threadIdx.x + blockDim.x * blockIdx.x;
+    unsigned int pz = threadIdx.z + blockDim.z * blockIdx.z;
+
+    // Cooperative load of the matrix; must happen before any thread returns.
+    __shared__ float mtx[4][4];
+    if (threadIdx.x < 16)
+    {
+        unsigned int row = threadIdx.x / 4;
+        unsigned int col = threadIdx.x % 4;
+        mtx[col][row] = p.matrix.fetch(p.matrix.nhwcIndex(pz, row, col, 0));
+    }
+    __syncthreads();
+
+    if (px >= p.gridSize.x)
+        return;
+
+    // Input position / direction.
+    vec3f pos(
+        p.points.fetch(p.points.nhwcIndex(pz, px, 0, 0)),
+        p.points.fetch(p.points.nhwcIndex(pz, px, 1, 0)),
+        p.points.fetch(p.points.nhwcIndex(pz, px, 2, 0))
+    );
+
+    if (p.isPoints)
+    {
+        // Full affine transform, including the translation row and the w output.
+        for (int c = 0; c < 4; ++c)
+            p.out.store(p.out.nhwcIndex(pz, px, c, 0), pos.x * mtx[0][c] + pos.y * mtx[1][c] + pos.z * mtx[2][c] + mtx[3][c]);
+    }
+    else
+    {
+        // Vectors: upper-left 3x3 only.
+        for (int c = 0; c < 3; ++c)
+            p.out.store(p.out.nhwcIndex(pz, px, c, 0), pos.x * mtx[0][c] + pos.y * mtx[1][c] + pos.z * mtx[2][c]);
+    }
+}
+
+// Backward pass of xfmPointsFwdKernel with respect to the points: multiplies
+// the incoming gradient (read from p.out) by the transposed matrix and stores
+// the result as the gradient of p.points. No matrix gradient is produced here.
+// The matrix is loaded into shared memory by the first 16 threads along x,
+// stored transposed so that mtx[c][r] holds matrix element (r, c).
+// NOTE(review): this assumes blockDim.x >= 16 and a single pz per block --
+// confirm against the launch code.
+__global__ void xfmPointsBwdKernel(XfmKernelParams p)
+{ 
+    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int pz = blockIdx.z * blockDim.z + threadIdx.z;
+
+    // Cooperative load of the matrix; must happen before any thread returns.
+    __shared__ float mtx[4][4];
+    if (threadIdx.x < 16)
+        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(pz, threadIdx.x / 4, threadIdx.x % 4, 0));
+    __syncthreads();
+
+    if (px >= p.gridSize.x)
+        return;
+
+    // The gradient is linear in d_out, so the input points are not needed.
+    float d_out_x = p.out.fetch(p.out.nhwcIndex(pz, px, 0, 0));
+    float d_out_y = p.out.fetch(p.out.nhwcIndex(pz, px, 1, 0));
+    float d_out_z = p.out.fetch(p.out.nhwcIndex(pz, px, 2, 0));
+
+    if (p.isPoints)
+    {
+        // Only the point path has a fourth (w) output component; it is fetched
+        // here so the vector path never touches a channel it does not use.
+        float d_out_w = p.out.fetch(p.out.nhwcIndex(pz, px, 3, 0));
+
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out_x * mtx[0][0] + d_out_y * mtx[0][1] + d_out_z * mtx[0][2] + d_out_w * mtx[0][3]);
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out_x * mtx[1][0] + d_out_y * mtx[1][1] + d_out_z * mtx[1][2] + d_out_w * mtx[1][3]);
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out_x * mtx[2][0] + d_out_y * mtx[2][1] + d_out_z * mtx[2][2] + d_out_w * mtx[2][3]);
+    }
+    else
+    {
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out_x * mtx[0][0] + d_out_y * mtx[0][1] + d_out_z * mtx[0][2]);
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out_x * mtx[1][0] + d_out_y * mtx[1][1] + d_out_z * mtx[1][2]);
+        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out_x * mtx[2][0] + d_out_y * mtx[2][1] + d_out_z * mtx[2][2]);
+    }
+}
--- a/render/renderutils/c_src/mesh.h
+++ b/render/renderutils/c_src/mesh.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Parameter bundle passed by value to the xfm point/vector kernels.
+struct XfmKernelParams
+{
+    bool            isPoints;   // true: the backward kernel also uses the fourth matrix column and fourth gradient channel
+    Tensor          points;     // input points/vectors; the backward kernel writes its gradient via store_grad
+    Tensor          matrix;     // per-batch 4x4 matrix, loaded into shared memory by the kernels
+    Tensor          out;        // forward result; read as the incoming gradient by the backward kernel
+    dim3            gridSize;   // extents used for the kernels' bounds checks
+};
--- a/render/renderutils/c_src/normal.cu
+++ b/render/renderutils/c_src/normal.cu
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#include "common.h"
+#include "normal.h"
+
+#define NORMAL_THRESHOLD 0.1f
+
+//------------------------------------------------------------------------
+// Perturb shading normal by tangent frame
+
+// Builds the shading normal from a tangent-space perturbation: the bitangent is
+// the normalized cross product of tangent and normal, its sign is flipped when
+// `opengl` is set, and the z component of the perturbation is clamped at zero.
+__device__ vec3f fwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, bool opengl)
+{
+    const vec3f bitangent = safeNormalize(cross(smooth_tng, smooth_nrm));
+    const vec3f combined = smooth_tng * perturbed_nrm.x + (opengl ? -1 : 1) * bitangent * perturbed_nrm.y + smooth_nrm * max(perturbed_nrm.z, 0.0f);
+    return safeNormalize(combined);
+}
+
+// Backward pass of fwdPerturbNormal. Recomputes the forward intermediates, then
+// accumulates (+=) gradients into d_perturbed_nrm, d_smooth_nrm and d_smooth_tng
+// given d_out, the gradient w.r.t. the normalized shading normal.
+__device__ void bwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, vec3f &d_perturbed_nrm, vec3f &d_smooth_nrm, vec3f &d_smooth_tng, const vec3f d_out, bool opengl)
+{
+    ////////////////////////////////////////////////////////////////////////
+    // FWD
+    vec3f _smooth_bitng = cross(smooth_tng, smooth_nrm);
+    vec3f smooth_bitng = safeNormalize(_smooth_bitng);
+    vec3f _shading_nrm = smooth_tng * perturbed_nrm.x + (opengl ? -1 : 1) * smooth_bitng * perturbed_nrm.y + smooth_nrm * max(perturbed_nrm.z, 0.0f);
+        
+    ////////////////////////////////////////////////////////////////////////
+    // BWD
+    // Gradient through the final safeNormalize of the unnormalized shading normal.
+    vec3f d_shading_nrm(0);
+    bwdSafeNormalize(_shading_nrm, d_shading_nrm, d_out);
+
+    vec3f d_smooth_bitng(0);
+    
+    // max(perturbed_nrm.z, 0) only passes gradient when z is strictly positive.
+    if (perturbed_nrm.z > 0.0f)
+    {
+        d_smooth_nrm += d_shading_nrm * perturbed_nrm.z;
+        d_perturbed_nrm.z += sum(d_shading_nrm * smooth_nrm);
+    }
+
+    // Bitangent term, with the same sign flip as in the forward pass.
+    d_smooth_bitng += (opengl ? -1 : 1) * d_shading_nrm * perturbed_nrm.y;
+    d_perturbed_nrm.y += (opengl ? -1 : 1) * sum(d_shading_nrm * smooth_bitng);
+
+    // Tangent term.
+    d_smooth_tng += d_shading_nrm * perturbed_nrm.x;
+    d_perturbed_nrm.x += sum(d_shading_nrm * smooth_tng);
+
+    // Gradient through the bitangent normalization, then through the cross product.
+    vec3f d__smooth_bitng(0);
+    bwdSafeNormalize(_smooth_bitng, d__smooth_bitng, d_smooth_bitng);
+
+    bwdCross(smooth_tng, smooth_nrm, d_smooth_tng, d_smooth_nrm, d__smooth_bitng);
+}
+
+//------------------------------------------------------------------------
+#define bent_nrm_eps 0.001f
+
+// Blends between the geometric and the smooth normal. The blend weight is
+// dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD clamped to [0, 1]: weight 0
+// returns geom_nrm, weight 1 returns smooth_nrm.
+__device__ vec3f fwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm)
+{
+    const float weight = clamp(dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD, 0.0f, 1.0f);
+    return geom_nrm * (1.0f - weight) + smooth_nrm * weight;
+}
+
+// Backward pass of fwdBendNormal; accumulates (+=) into d_view_vec,
+// d_smooth_nrm and d_geom_nrm given the output gradient d_out.
+__device__ void bwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm, vec3f& d_view_vec, vec3f& d_smooth_nrm, vec3f& d_geom_nrm, const vec3f d_out)
+{
+    const float cos_view = dot(view_vec, smooth_nrm);
+
+    // Above the threshold the forward pass returns smooth_nrm unchanged,
+    // so the gradient passes straight through to it.
+    if (cos_view > NORMAL_THRESHOLD)
+    {
+        d_smooth_nrm += d_out;
+        return;
+    }
+
+    // Forward blend weight: out = geom_nrm * (1 - weight) + smooth_nrm * weight
+    const float weight = clamp(cos_view / NORMAL_THRESHOLD, 0.0f, 1.0f);
+
+    d_geom_nrm   += d_out * (1.0f - weight);
+    d_smooth_nrm += d_out * weight;
+    const float d_weight = sum(d_out * (smooth_nrm - geom_nrm));
+
+    // The upper clamp cannot be active here (handled by the early return), so
+    // only the lower clamp can zero the gradient w.r.t. the dot product.
+    const float d_cos_view = cos_view < 0.0f ? 0.0f : d_weight / NORMAL_THRESHOLD;
+
+    bwdDot(view_vec, smooth_nrm, d_view_vec, d_smooth_nrm, d_cos_view);
+}
+
+//------------------------------------------------------------------------
+// Kernels
+
+// Forward kernel: one thread per (px, py, pz) element. Normalizes the
+// interpolated normal/tangent and the view vector, applies the tangent-space
+// perturbation, then bends the result; with two-sided shading the normals are
+// negated when the geometric normal faces away from the viewer.
+__global__ void PrepareShadingNormalFwdKernel(PrepareShadingNormalKernelParams p)
+{
+    // Pixel / batch coordinates of this thread.
+    const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+    const unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
+    const unsigned int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    // Inputs (fetch3 broadcasts over size-1 dimensions).
+    const vec3f pos           = p.pos.fetch3(px, py, pz);
+    const vec3f view_pos      = p.view_pos.fetch3(px, py, pz);
+    const vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
+    const vec3f smooth_nrm    = safeNormalize(p.smooth_nrm.fetch3(px, py, pz));
+    const vec3f smooth_tng    = safeNormalize(p.smooth_tng.fetch3(px, py, pz));
+    const vec3f geom_nrm      = p.geom_nrm.fetch3(px, py, pz);
+
+    const vec3f view_vec    = safeNormalize(view_pos - pos);
+    const vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);
+
+    // Back-facing geometry under two-sided shading uses the negated normals.
+    const bool flip = p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f;
+    const vec3f bent = flip ? fwdBendNormal(view_vec, -shading_nrm, -geom_nrm)
+                            : fwdBendNormal(view_vec, shading_nrm, geom_nrm);
+
+    p.out.store(px, py, pz, bent);
+}
+
+// Backward kernel matching PrepareShadingNormalFwdKernel: one thread per
+// (px, py, pz) element. Recomputes the forward intermediates, back-propagates
+// the gradient read from p.out through bend -> perturb -> normalize, and stores
+// a gradient for every input tensor.
+__global__ void PrepareShadingNormalBwdKernel(PrepareShadingNormalKernelParams p) 
+{ 
+    // Calculate pixel position.
+    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
+    unsigned int pz = blockIdx.z;
+    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+        return;
+
+    vec3f pos = p.pos.fetch3(px, py, pz);
+    vec3f view_pos = p.view_pos.fetch3(px, py, pz);
+    vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
+    vec3f _smooth_nrm = p.smooth_nrm.fetch3(px, py, pz);
+    vec3f _smooth_tng = p.smooth_tng.fetch3(px, py, pz);
+    vec3f geom_nrm = p.geom_nrm.fetch3(px, py, pz);
+    // In the backward kernel p.out is read as the gradient w.r.t. the forward output.
+    vec3f d_out = p.out.fetch3(px, py, pz);
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    // FWD
+
+    // Underscore-prefixed names are the unnormalized inputs; they are kept
+    // because bwdSafeNormalize needs the pre-normalization vectors.
+    vec3f smooth_nrm = safeNormalize(_smooth_nrm);
+    vec3f smooth_tng = safeNormalize(_smooth_tng);
+    vec3f _view_vec = view_pos - pos;
+    vec3f view_vec = safeNormalize(view_pos - pos);
+
+    vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////
+    // BWD
+
+    vec3f d_view_vec(0), d_shading_nrm(0), d_geom_nrm(0);
+    if (p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f)
+    {
+        // The forward pass fed negated normals into the bend step, so the
+        // gradients w.r.t. the un-negated normals are negated as well.
+        bwdBendNormal(view_vec, -shading_nrm, -geom_nrm, d_view_vec, d_shading_nrm, d_geom_nrm, d_out);
+        d_shading_nrm = -d_shading_nrm;
+        d_geom_nrm = -d_geom_nrm;
+    }
+    else
+        bwdBendNormal(view_vec, shading_nrm, geom_nrm, d_view_vec, d_shading_nrm, d_geom_nrm, d_out);
+
+    vec3f d_perturbed_nrm(0), d_smooth_nrm(0), d_smooth_tng(0);
+    bwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, d_perturbed_nrm, d_smooth_nrm, d_smooth_tng, d_shading_nrm, p.opengl);
+
+    // Back through the three input normalizations.
+    vec3f d__view_vec(0), d__smooth_nrm(0), d__smooth_tng(0);
+    bwdSafeNormalize(_view_vec, d__view_vec, d_view_vec);
+    bwdSafeNormalize(_smooth_nrm, d__smooth_nrm, d_smooth_nrm);
+    bwdSafeNormalize(_smooth_tng, d__smooth_tng, d_smooth_tng);
+
+    // _view_vec = view_pos - pos, hence the opposite signs for pos and view_pos.
+    p.pos.store_grad(px, py, pz, -d__view_vec);
+    p.view_pos.store_grad(px, py, pz, d__view_vec);
+    p.perturbed_nrm.store_grad(px, py, pz, d_perturbed_nrm);
+    p.smooth_nrm.store_grad(px, py, pz, d__smooth_nrm);
+    p.smooth_tng.store_grad(px, py, pz, d__smooth_tng);
+    p.geom_nrm.store_grad(px, py, pz, d_geom_nrm);
+}
--- a/render/renderutils/c_src/normal.h
+++ b/render/renderutils/c_src/normal.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+
+#include "common.h"
+
+// Parameter bundle passed by value to the PrepareShadingNormal forward/backward kernels.
+struct PrepareShadingNormalKernelParams
+{
+    Tensor  pos;            // surface position; view vector is normalize(view_pos - pos)
+    Tensor  view_pos;       // viewer position
+    Tensor  perturbed_nrm;  // tangent-space normal perturbation
+    Tensor  smooth_nrm;     // interpolated normal (normalized inside the kernels)
+    Tensor  smooth_tng;     // interpolated tangent (normalized inside the kernels)
+    Tensor  geom_nrm;       // geometric normal (used as-is, not normalized by the kernels)
+    Tensor  out;            // forward: result is stored here; backward: read as the incoming gradient
+    dim3    gridSize;       // extents used for the kernels' bounds checks
+    bool    two_sided_shading, opengl; // negate normals for back-facing geometry / flip the bitangent sign
+};
--- a/render/renderutils/c_src/tensor.h
+++ b/render/renderutils/c_src/tensor.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once
+#if defined(__CUDACC__) && defined(BFLOAT16)
+#include <cuda_bf16.h> // bfloat16 is float32 compatible with less mantissa bits
+#endif
+
+//---------------------------------------------------------------------------------
+// CUDA-side Tensor class for in/out parameter parsing. Can be float32 or bfloat16
+
+// Plain view of a 4-D tensor handed to the kernels: raw value/gradient
+// pointers plus extents and strides, with device-side index/load/store helpers.
+struct Tensor
+{
+    void*   val;                // element data (read by fetch, written by store)
+    void*   d_val;              // gradient data (written by store_grad)
+    int     dims[4], _dims[4];  // dims: extents used for size-1 broadcasting in nhwcIndex;
+                                // _dims: extents used by nhwcIndexContinuous
+                                // (presumably the full, un-broadcast gradient shape -- set by host code not shown here)
+    int     strides[4];         // per-dimension element strides for val
+    bool    fp16;               // with BFLOAT16 defined: true selects __nv_bfloat16 storage, false selects float
+
+    // NOTE(review): this guard tests __CUDA__ rather than __CUDACC__ -- confirm which compilers define it.
+#if defined(__CUDA__) && !defined(__CUDA_ARCH__)
+    Tensor() : val(nullptr), d_val(nullptr), fp16(true), dims{ 0, 0, 0, 0 }, _dims{ 0, 0, 0, 0 }, strides{ 0, 0, 0, 0 } {}
+#endif
+
+#ifdef __CUDACC__
+    // Helpers to index and read/write a single element
+    // _nhwcIndex: plain strided index, no broadcasting.
+    __device__ inline int   _nhwcIndex(int n, int h, int w, int c) const { return n * strides[0] + h * strides[1] + w * strides[2] + c * strides[3]; }
+    // nhwcIndex: strided index where any dimension of extent 1 contributes offset 0 (broadcast).
+    __device__ inline int   nhwcIndex(int n, int h, int w, int c) const { return (dims[0] == 1 ? 0 : n * strides[0]) + (dims[1] == 1 ? 0 : h * strides[1]) + (dims[2] == 1 ? 0 : w * strides[2]) + (dims[3] == 1 ? 0 : c * strides[3]); }
+    // nhwcIndexContinuous: densely packed index computed from _dims, ignoring strides.
+    __device__ inline int   nhwcIndexContinuous(int n, int h, int w, int c) const { return ((n * _dims[1] + h) * _dims[2] + w) * _dims[3] + c; }
+#ifdef BFLOAT16
+    // Element access converts to/from bfloat16 when the fp16 flag is set.
+    __device__ inline float fetch(unsigned int idx) const { return fp16 ? __bfloat162float(((__nv_bfloat16*)val)[idx]) : ((float*)val)[idx]; }
+    __device__ inline void  store(unsigned int idx, float _val) { if (fp16) ((__nv_bfloat16*)val)[idx] = __float2bfloat16(_val); else ((float*)val)[idx] = _val; }
+    __device__ inline void  store_grad(unsigned int idx, float _val) { if (fp16) ((__nv_bfloat16*)d_val)[idx] = __float2bfloat16(_val); else ((float*)d_val)[idx] = _val; }
+#else
+    // Without BFLOAT16 the data is always accessed as float; the fp16 flag is not consulted.
+    __device__ inline float fetch(unsigned int idx) const { return ((float*)val)[idx]; }
+    __device__ inline void  store(unsigned int idx, float _val) { ((float*)val)[idx] = _val; }
+    __device__ inline void  store_grad(unsigned int idx, float _val) { ((float*)d_val)[idx] = _val; }
+#endif
+
+    //////////////////////////////////////////////////////////////////////////////////////////
+    // Fetch, use broadcasting for tensor dimensions of size 1
+    // Arguments are (x, y, z); they map to index order (n = z, h = y, w = x).
+    __device__ inline float fetch1(unsigned int x, unsigned int y, unsigned int z) const
+    {
+        return fetch(nhwcIndex(z, y, x, 0));
+    }
+
+    __device__ inline vec3f fetch3(unsigned int x, unsigned int y, unsigned int z) const
+    {
+        return vec3f(
+            fetch(nhwcIndex(z, y, x, 0)),
+            fetch(nhwcIndex(z, y, x, 1)),
+            fetch(nhwcIndex(z, y, x, 2))
+        );
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Store, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside
+    // Values are written at the plain strided index (_nhwcIndex).
+    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, float _val)
+    {
+        store(_nhwcIndex(z, y, x, 0), _val);
+    }
+
+    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
+    {
+        store(_nhwcIndex(z, y, x, 0), _val.x);
+        store(_nhwcIndex(z, y, x, 1), _val.y);
+        store(_nhwcIndex(z, y, x, 2), _val.z);
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Store gradient, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside
+    // Gradients are written at the densely packed index (nhwcIndexContinuous).
+    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, float _val)
+    {
+        store_grad(nhwcIndexContinuous(z, y, x, 0), _val);
+    }
+
+    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
+    {
+        store_grad(nhwcIndexContinuous(z, y, x, 0), _val.x);
+        store_grad(nhwcIndexContinuous(z, y, x, 1), _val.y);
+        store_grad(nhwcIndexContinuous(z, y, x, 2), _val.z);
+    }
+#endif
+
+};
--- a/render/renderutils/c_src/torch_bindings.cpp
+++ b/render/renderutils/c_src/torch_bindings.cpp
--- a/render/renderutils/c_src/vec3f.h
+++ b/render/renderutils/c_src/vec3f.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once 
+
+// Three-component float vector. The constructors and compound-assignment
+// operators are only declared when compiling with nvcc (__CUDACC__).
+struct vec3f
+{
+    float x, y, z;
+
+#ifdef __CUDACC__
+    // The default constructor leaves the components uninitialized.
+    __device__ vec3f() { }
+    // Broadcasts one scalar to all three components.
+    __device__ vec3f(float v) : x(v), y(v), z(v) { }
+    __device__ vec3f(float _x, float _y, float _z) : x(_x), y(_y), z(_z) { }
+    __device__ vec3f(float3 v) : x(v.x), y(v.y), z(v.z) { }
+
+    // Component-wise compound assignment.
+    __device__ inline vec3f& operator+=(const vec3f& b)
+    {
+        x += b.x;
+        y += b.y;
+        z += b.z;
+        return *this;
+    }
+    __device__ inline vec3f& operator-=(const vec3f& b)
+    {
+        x -= b.x;
+        y -= b.y;
+        z -= b.z;
+        return *this;
+    }
+    __device__ inline vec3f& operator*=(const vec3f& b)
+    {
+        x *= b.x;
+        y *= b.y;
+        z *= b.z;
+        return *this;
+    }
+    __device__ inline vec3f& operator/=(const vec3f& b)
+    {
+        x /= b.x;
+        y /= b.y;
+        z /= b.z;
+        return *this;
+    }
+#endif
+};
+
+#ifdef __CUDACC__
+// Component-wise binary arithmetic and unary negation for vec3f.
+__device__ static inline vec3f operator+(const vec3f& a, const vec3f& b)
+{
+    return vec3f(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+__device__ static inline vec3f operator-(const vec3f& a, const vec3f& b)
+{
+    return vec3f(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+__device__ static inline vec3f operator*(const vec3f& a, const vec3f& b)
+{
+    return vec3f(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+__device__ static inline vec3f operator/(const vec3f& a, const vec3f& b)
+{
+    return vec3f(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+__device__ static inline vec3f operator-(const vec3f& a)
+{
+    return vec3f(-a.x, -a.y, -a.z);
+}
+
+// Returns the sum of the three components, added left to right.
+__device__ static inline float sum(vec3f a)
+{
+    float total = a.x + a.y;
+    total = total + a.z;
+    return total;
+}
+
+// Returns the cross product a x b.
+__device__ static inline vec3f cross(vec3f a, vec3f b)
+{
+    return vec3f(
+        a.y * b.z - a.z * b.y,
+        a.z * b.x - a.x * b.z,
+        a.x * b.y - a.y * b.x
+    );
+}
+
+// Backward pass of cross(a, b): accumulates (+=) the gradients into d_a and d_b
+// given d_out, the gradient w.r.t. the cross product.
+__device__ static inline void bwdCross(vec3f a, vec3f b, vec3f &d_a, vec3f &d_b, vec3f d_out)
+{
+    // Contribution to d_a (equals b x d_out).
+    d_a += vec3f(
+        d_out.z * b.y - d_out.y * b.z,
+        d_out.x * b.z - d_out.z * b.x,
+        d_out.y * b.x - d_out.x * b.y
+    );
+
+    // Contribution to d_b (equals d_out x a).
+    d_b += vec3f(
+        d_out.y * a.z - d_out.z * a.y,
+        d_out.z * a.x - d_out.x * a.z,
+        d_out.x * a.y - d_out.y * a.x
+    );
+}
+
+// Returns the dot product of a and b.
+__device__ static inline float dot(vec3f a, vec3f b)
+{
+    const float xx = a.x * b.x;
+    const float yy = a.y * b.y;
+    const float zz = a.z * b.z;
+    return xx + yy + zz;
+}
+
+// Backward pass of dot(a, b): accumulates d_out * b into d_a and d_out * a into d_b.
+__device__ static inline void bwdDot(vec3f a, vec3f b, vec3f& d_a, vec3f& d_b, float d_out)
+{
+    d_a.x += d_out * b.x;
+    d_a.y += d_out * b.y;
+    d_a.z += d_out * b.z;
+
+    d_b.x += d_out * a.x;
+    d_b.y += d_out * a.y;
+    d_b.z += d_out * a.z;
+}
+
+// Returns 2 * n * dot(n, x) - x, i.e. x mirrored about the direction n.
+__device__ static inline vec3f reflect(vec3f x, vec3f n)
+{
+    const vec3f doubled_n = n * 2.0f;
+    const float n_dot_x = dot(n, x);
+    return doubled_n * n_dot_x - x;
+}
+
+// Backward pass of reflect(x, n) = 2 * n * dot(n, x) - x. Accumulates (+=) into
+// d_x and d_n the product of d_out with the hand-expanded Jacobians:
+//   d out_i / d x_j = 2 * n_i * n_j - delta_ij
+//   d out_i / d n_j = 2 * delta_ij * dot(n, x) + 2 * n_i * x_j
+__device__ static inline void bwdReflect(vec3f x, vec3f n, vec3f& d_x, vec3f& d_n, const vec3f d_out)
+{
+    // Gradient w.r.t. x.
+    d_x.x += d_out.x * (2 * n.x * n.x - 1) + d_out.y * (2 * n.x * n.y) + d_out.z * (2 * n.x * n.z);
+    d_x.y += d_out.x * (2 * n.x * n.y) + d_out.y * (2 * n.y * n.y - 1) + d_out.z * (2 * n.y * n.z);
+    d_x.z += d_out.x * (2 * n.x * n.z) + d_out.y * (2 * n.y * n.z) + d_out.z * (2 * n.z * n.z - 1);
+
+    // Gradient w.r.t. n; the diagonal terms carry dot(n, x) plus one extra n_j * x_j.
+    d_n.x += d_out.x * (2 * (2 * n.x * x.x + n.y * x.y + n.z * x.z)) + d_out.y * (2 * n.y * x.x) + d_out.z * (2 * n.z * x.x);
+    d_n.y += d_out.x * (2 * n.x * x.y) + d_out.y * (2 * (n.x * x.x + 2 * n.y * x.y + n.z * x.z)) + d_out.z * (2 * n.z * x.y);
+    d_n.z += d_out.x * (2 * n.x * x.z) + d_out.y * (2 * n.y * x.z) + d_out.z * (2 * (n.x * x.x + n.y * x.y + 2 * n.z * x.z));
+}
+
+// Returns v scaled to unit length, or the zero vector when the length of v is
+// not strictly positive.
+__device__ static inline vec3f safeNormalize(vec3f v)
+{
+    const float len = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+    if (len > 0.0f)
+        return v / len;
+    return vec3f(0.0f);
+}
+
+// Backward pass of safeNormalize(v): accumulates (+=) into d_v the gradient of
+// v / |v| given d_out. Like the forward pass, nothing is accumulated when the
+// length of v is not strictly positive.
+__device__ static inline void bwdSafeNormalize(const vec3f v, vec3f& d_v, const vec3f d_out)
+{
+    // Squared length is computed once and reused for both the length test and the scale factor.
+    const float len_sq = v.x * v.x + v.y * v.y + v.z * v.z;
+    const float l = sqrtf(len_sq);
+    if (l > 0.0f)
+    {
+        // 1 / |v|^3. The float literal keeps the division in single precision;
+        // the previous `1.0` literal promoted it to a double-precision divide.
+        const float fac = 1.0f / powf(len_sq, 1.5f);
+        d_v.x += (d_out.x * (v.y * v.y + v.z * v.z) - d_out.y * (v.x * v.y) - d_out.z * (v.x * v.z)) * fac;
+        d_v.y += (d_out.y * (v.x * v.x + v.z * v.z) - d_out.x * (v.y * v.x) - d_out.z * (v.y * v.z)) * fac;
+        d_v.z += (d_out.z * (v.x * v.x + v.y * v.y) - d_out.x * (v.z * v.x) - d_out.y * (v.z * v.y)) * fac;
+    }
+}
+
+#endif
--- a/render/renderutils/c_src/vec4f.h
+++ b/render/renderutils/c_src/vec4f.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related 
+ * documentation and any modifications thereto. Any use, reproduction, 
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or 
+ * its affiliates is strictly prohibited.
+ */
+
+#pragma once 
+
+// Four-component float vector. The constructors are only declared when
+// compiling with nvcc (__CUDACC__).
+struct vec4f
+{
+    float x, y, z, w;
+
+#ifdef __CUDACC__
+    // The default constructor leaves the components uninitialized.
+    __device__ vec4f() { }
+    // Broadcasts one scalar to all four components.
+    __device__ vec4f(float v) : x(v), y(v), z(v), w(v) { }
+    __device__ vec4f(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) { }
+    __device__ vec4f(float4 v) : x(v.x), y(v.y), z(v.z), w(v.w) { }
+#endif
+};
+
--- a/render/renderutils/loss.py
+++ b/render/renderutils/loss.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+#----------------------------------------------------------------------------
+# HDR image losses
+#----------------------------------------------------------------------------
+
+def _tonemap_srgb(f):
+    return torch.where(f > 0.0031308, torch.pow(torch.clamp(f, min=0.0031308), 1.0/2.4)*1.055 - 0.055, 12.92*f)
+
+def _SMAPE(img, target, eps=0.01):
+    nom = torch.abs(img - target)
+    denom = torch.abs(img) + torch.abs(target) + 0.01
+    return torch.mean(nom / denom)
+
+def _RELMSE(img, target, eps=0.1):
+    nom = (img - target) * (img - target)
+    denom = img * img + target * target + 0.1 
+    return torch.mean(nom / denom)
+
+def image_loss_fn(img, target, loss, tonemapper):
+    if tonemapper == 'log_srgb':
+        img    = _tonemap_srgb(torch.log(torch.clamp(img, min=0, max=65535) + 1))
+        target = _tonemap_srgb(torch.log(torch.clamp(target, min=0, max=65535) + 1))
+
+    if loss == 'mse':
+        return torch.nn.functional.mse_loss(img, target)
+    elif loss == 'smape':
+        return _SMAPE(img, target)
+    elif loss == 'relmse':
+        return _RELMSE(img, target)
+    else:
+        return torch.nn.functional.l1_loss(img, target)
--- a/render/renderutils/ops.py
+++ b/render/renderutils/ops.py
@@ -0,0 +1,554 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import numpy as np
+import os
+import sys
+import torch
+import torch.utils.cpp_extension
+
+from .bsdf import *
+from .loss import *
+
+#----------------------------------------------------------------------------
+# C++/Cuda plugin compiler/loader.
+
+_cached_plugin = None
+def _get_plugin():
+    # Return cached plugin if already loaded.
+    global _cached_plugin
+    if _cached_plugin is not None:
+        return _cached_plugin
+
+    # Make sure we can find the necessary compiler and libary binaries.
+    if os.name == 'nt':
+        def find_cl_path():
+            import glob
+            for edition in ['Enterprise', 'Professional', 'BuildTools', 'Community']:
+                paths = sorted(glob.glob(r"C:\Program Files (x86)\Microsoft Visual Studio\*\%s\VC\Tools\MSVC\*\bin\Hostx64\x64" % edition), reverse=True)
+                if paths:
+                    return paths[0]
+
+        # If cl.exe is not on path, try to find it.
+        if os.system("where cl.exe >nul 2>nul") != 0:
+            cl_path = find_cl_path()
+            if cl_path is None:
+                raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation")
+            os.environ['PATH'] += ';' + cl_path
+
+    # Compiler options.
+    opts = ['-DNVDR_TORCH']
+
+    # Linker options.
+    if os.name == 'posix':
+        ldflags = ['-lcuda', '-lnvrtc']
+    elif os.name == 'nt':
+        ldflags = ['cuda.lib', 'advapi32.lib', 'nvrtc.lib']
+
+    # List of sources.
+    source_files = [
+        'c_src/mesh.cu',
+        'c_src/loss.cu',
+        'c_src/bsdf.cu',
+        'c_src/normal.cu',
+        'c_src/cubemap.cu',
+        'c_src/common.cpp',
+        'c_src/torch_bindings.cpp'
+    ]
+
+    # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine.
+    os.environ['TORCH_CUDA_ARCH_LIST'] = ''
+
+    # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment.
+    try:
+        lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory('renderutils_plugin', False), 'lock')
+        if os.path.exists(lock_fn):
+            print("Warning: Lock file exists in build directory: '%s'" % lock_fn)
+    except:
+        pass
+
+    # Compile and load.
+    source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files]
+    torch.utils.cpp_extension.load(name='renderutils_plugin', sources=source_paths, extra_cflags=opts,
+         extra_cuda_cflags=opts, extra_ldflags=ldflags, with_cuda=True, verbose=True)
+
+    # Import, cache, and return the compiled module.
+    import renderutils_plugin
+    _cached_plugin = renderutils_plugin
+    return _cached_plugin
+
+#----------------------------------------------------------------------------
+# Internal kernels, just used for testing functionality
+
+class _fresnel_shlick_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, f0, f90, cosTheta):
+        out = _get_plugin().fresnel_shlick_fwd(f0, f90, cosTheta, False)
+        ctx.save_for_backward(f0, f90, cosTheta)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        f0, f90, cosTheta = ctx.saved_variables
+        return _get_plugin().fresnel_shlick_bwd(f0, f90, cosTheta, dout) + (None,)
+
+def _fresnel_shlick(f0, f90, cosTheta, use_python=False):
+    if use_python:
+        out = bsdf_fresnel_shlick(f0, f90, cosTheta)
+    else:
+        out = _fresnel_shlick_func.apply(f0, f90, cosTheta)
+
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of _fresnel_shlick contains inf or NaN"
+    return out
+
+
+class _ndf_ggx_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, alphaSqr, cosTheta):
+        out = _get_plugin().ndf_ggx_fwd(alphaSqr, cosTheta, False)
+        ctx.save_for_backward(alphaSqr, cosTheta)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        alphaSqr, cosTheta = ctx.saved_variables
+        return _get_plugin().ndf_ggx_bwd(alphaSqr, cosTheta, dout) + (None,)
+
+def _ndf_ggx(alphaSqr, cosTheta, use_python=False):
+    if use_python:
+        out = bsdf_ndf_ggx(alphaSqr, cosTheta)
+    else:
+        out = _ndf_ggx_func.apply(alphaSqr, cosTheta)
+
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of _ndf_ggx contains inf or NaN"
+    return out
+
+class _lambda_ggx_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, alphaSqr, cosTheta):
+        out = _get_plugin().lambda_ggx_fwd(alphaSqr, cosTheta, False)
+        ctx.save_for_backward(alphaSqr, cosTheta)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        alphaSqr, cosTheta = ctx.saved_variables
+        return _get_plugin().lambda_ggx_bwd(alphaSqr, cosTheta, dout) + (None,)
+
+def _lambda_ggx(alphaSqr, cosTheta, use_python=False):
+    if use_python:
+        out = bsdf_lambda_ggx(alphaSqr, cosTheta)
+    else:
+        out = _lambda_ggx_func.apply(alphaSqr, cosTheta)
+
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of _lambda_ggx contains inf or NaN"
+    return out
+
+class _masking_smith_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, alphaSqr, cosThetaI, cosThetaO):
+        ctx.save_for_backward(alphaSqr, cosThetaI, cosThetaO)
+        out = _get_plugin().masking_smith_fwd(alphaSqr, cosThetaI, cosThetaO, False)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        alphaSqr, cosThetaI, cosThetaO = ctx.saved_variables
+        return _get_plugin().masking_smith_bwd(alphaSqr, cosThetaI, cosThetaO, dout) + (None,)
+
+def _masking_smith(alphaSqr, cosThetaI, cosThetaO, use_python=False):
+    if use_python:
+        out = bsdf_masking_smith_ggx_correlated(alphaSqr, cosThetaI, cosThetaO)
+    else:
+        out = _masking_smith_func.apply(alphaSqr, cosThetaI, cosThetaO)
+
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of _masking_smith contains inf or NaN"
+    return out
+
+#----------------------------------------------------------------------------
+# Shading normal setup (bump mapping + bent normals)
+
+class _prepare_shading_normal_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl):
+        ctx.two_sided_shading, ctx.opengl = two_sided_shading, opengl
+        out = _get_plugin().prepare_shading_normal_fwd(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl, False)
+        ctx.save_for_backward(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm = ctx.saved_variables
+        return _get_plugin().prepare_shading_normal_bwd(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, dout, ctx.two_sided_shading, ctx.opengl) + (None, None, None)
+
+def prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading=True, opengl=True, use_python=False):
+    '''Takes care of all corner cases and produces a final normal used for shading:
+        - Constructs tangent space
+        - Flips normal direction based on geometric normal for two sided Shading
+        - Perturbs shading normal by normal map
+        - Bends backfacing normals towards the camera to avoid shading artifacts
+
+        All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent.
+
+    Args:
+        pos: World space g-buffer position.
+        view_pos: Camera position in world space (typically using broadcasting).
+        perturbed_nrm: Trangent-space normal perturbation from normal map lookup.
+        smooth_nrm: Interpolated vertex normals.
+        smooth_tng: Interpolated vertex tangents.
+        geom_nrm: Geometric (face) normals.
+        two_sided_shading: Use one/two sided shading
+        opengl: Use OpenGL/DirectX normal map conventions 
+        use_python: Use PyTorch implementation (for validation)
+    Returns:
+        Final shading normal
+    '''    
+
+    if perturbed_nrm is None:
+        perturbed_nrm = torch.tensor([0, 0, 1], dtype=torch.float32, device='cuda', requires_grad=False)[None, None, None, ...]
+    
+    if use_python:
+        out = bsdf_prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl)
+    else:
+        out = _prepare_shading_normal_func.apply(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl)
+    
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of prepare_shading_normal contains inf or NaN"
+    return out
+
+#----------------------------------------------------------------------------
+# BSDF functions
+
+class _lambert_func(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, nrm, wi):
+        out = _get_plugin().lambert_fwd(nrm, wi, False)
+        ctx.save_for_backward(nrm, wi)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        nrm, wi = ctx.saved_variables
+        return _get_plugin().lambert_bwd(nrm, wi, dout) + (None,)
+
+def lambert(nrm, wi, use_python=False):
+    '''Evaluate the Lambertian bsdf.
+    Inputs are expected as [minibatch_size, height, width, 3] tensors (or anything broadcastable to that).
+
+    Args:
+        nrm: Shading normal in world space.
+        wi: Light vector in world space.
+        use_python: Route through the PyTorch reference implementation instead of the plugin.
+
+    Returns:
+        Diffuse shading term with shape [minibatch_size, height, width, 1]
+    '''
+
+    # Pick the implementation first, then evaluate it once.
+    shade = bsdf_lambert if use_python else _lambert_func.apply
+    out = shade(nrm, wi)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of lambert contains inf or NaN"
+    return out
+
+class _frostbite_diffuse_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's frostbite_fwd / frostbite_bwd.'''
+
+    @staticmethod
+    def forward(ctx, nrm, wi, wo, linearRoughness):
+        # The trailing False is passed through unchanged; its meaning is defined by the plugin.
+        out = _get_plugin().frostbite_fwd(nrm, wi, wo, linearRoughness, False)
+        # All four inputs are needed again by the backward call.
+        ctx.save_for_backward(nrm, wi, wo, linearRoughness)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        nrm, wi, wo, linearRoughness = ctx.saved_tensors
+        # The plugin's gradient tuple is extended with one trailing None, as before.
+        return _get_plugin().frostbite_bwd(nrm, wi, wo, linearRoughness, dout) + (None,)
+
+def frostbite_diffuse(nrm, wi, wo, linearRoughness, use_python=False):
+    '''Frostbite, normalized Disney Diffuse bsdf.
+    All tensors assume a shape of [minibatch_size, height, width, 3] or broadcastable equivalent.
+
+    Args:
+        nrm: World space shading normal.
+        wi: World space light vector.
+        wo: World space camera vector.
+        linearRoughness: Material roughness
+        use_python: Use PyTorch implementation (for validation)
+
+    Returns:
+        Shaded diffuse value with shape [minibatch_size, height, width, 1]
+    '''
+
+    if use_python:
+        out = bsdf_frostbite(nrm, wi, wo, linearRoughness)
+    else:
+        out = _frostbite_diffuse_func.apply(nrm, wi, wo, linearRoughness)
+
+    # Only checked while autograd anomaly detection is enabled.
+    if torch.is_anomaly_enabled():
+        # The message names this function (it previously said "lambert" from a copy-paste).
+        assert torch.all(torch.isfinite(out)), "Output of frostbite_diffuse contains inf or NaN"
+    return out
+
+class _pbr_specular_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's pbr_specular_fwd / pbr_specular_bwd.'''
+
+    @staticmethod
+    def forward(ctx, col, nrm, wo, wi, alpha, min_roughness):
+        ctx.save_for_backward(col, nrm, wo, wi, alpha)
+        # min_roughness is not saved as a tensor, so it is kept as a plain ctx attribute.
+        ctx.min_roughness = min_roughness
+        # The trailing False is passed through unchanged; its meaning is defined by the plugin.
+        out = _get_plugin().pbr_specular_fwd(col, nrm, wo, wi, alpha, min_roughness, False)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        col, nrm, wo, wi, alpha = ctx.saved_tensors
+        # The plugin's gradient tuple is extended with two trailing Nones, as before.
+        return _get_plugin().pbr_specular_bwd(col, nrm, wo, wi, alpha, ctx.min_roughness, dout) + (None, None)
+
+def pbr_specular(col, nrm, wo, wi, alpha, min_roughness=0.08, use_python=False):
+    '''Evaluate the physically-based specular bsdf.
+    Unless noted otherwise, inputs are [minibatch_size, height, width, 3] tensors or broadcastable to that.
+
+    Args:
+        col: Color of the specular lobe.
+        nrm: Shading normal in world space.
+        wo: Camera vector in world space.
+        wi: Light vector in world space.
+        alpha: Specular roughness parameter, shape [minibatch_size, height, width, 1]
+        min_roughness: Scalar threshold used to clamp roughness.
+        use_python: Route through the PyTorch reference implementation instead of the plugin.
+
+    Returns:
+        Shaded specular color
+    '''
+
+    if not use_python:
+        out = _pbr_specular_func.apply(col, nrm, wo, wi, alpha, min_roughness)
+    else:
+        out = bsdf_pbr_specular(col, nrm, wo, wi, alpha, min_roughness=min_roughness)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of pbr_specular contains inf or NaN"
+    return out
+
+class _pbr_bsdf_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's pbr_bsdf_fwd / pbr_bsdf_bwd.'''
+
+    @staticmethod
+    def forward(ctx, kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF):
+        ctx.save_for_backward(kd, arm, pos, nrm, view_pos, light_pos)
+        # Non-tensor settings are kept as plain ctx attributes for the backward call.
+        ctx.min_roughness = min_roughness
+        ctx.BSDF = BSDF
+        # The trailing False is passed through unchanged; its meaning is defined by the plugin.
+        out = _get_plugin().pbr_bsdf_fwd(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF, False)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        kd, arm, pos, nrm, view_pos, light_pos = ctx.saved_tensors
+        # The plugin's gradient tuple is extended with three trailing Nones, as before.
+        return _get_plugin().pbr_bsdf_bwd(kd, arm, pos, nrm, view_pos, light_pos, ctx.min_roughness, ctx.BSDF, dout) + (None, None, None)
+
+def pbr_bsdf(kd, arm, pos, nrm, view_pos, light_pos, min_roughness=0.08, bsdf="lambert", use_python=False):
+    '''Evaluate the physically-based bsdf (diffuse and specular lobes together).
+    Unless noted otherwise, inputs are [minibatch_size, height, width, 3] tensors or broadcastable to that.
+
+    Args:
+        kd: Diffuse albedo.
+        arm: Specular parameters (attenuation, linear roughness, metalness).
+        pos: Position in world space.
+        nrm: Shading normal in world space.
+        view_pos: Camera position in world space, typically broadcast.
+        light_pos: Light position in world space, typically broadcast.
+        min_roughness: Scalar threshold used to clamp roughness.
+        bsdf: Diffuse model selector; 'frostbite' selects id 1, any other value selects id 0 (lambert).
+        use_python: Route through the PyTorch reference implementation instead of the plugin.
+
+    Returns:
+        Shaded color.
+    '''
+
+    # Integer id handed to both implementations: 1 for frostbite, 0 otherwise.
+    BSDF = 1 if bsdf == 'frostbite' else 0
+
+    args = (kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF)
+    out = bsdf_pbr(*args) if use_python else _pbr_bsdf_func.apply(*args)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of pbr_bsdf contains inf or NaN"
+    return out
+
+#----------------------------------------------------------------------------
+# cubemap filter with filtering across edges
+
+class _diffuse_cubemap_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's diffuse_cubemap_fwd / diffuse_cubemap_bwd.'''
+
+    @staticmethod
+    def forward(ctx, cubemap):
+        out = _get_plugin().diffuse_cubemap_fwd(cubemap)
+        # The input cubemap is needed again by the backward call.
+        ctx.save_for_backward(cubemap)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        cubemap, = ctx.saved_tensors
+        cubemap_grad = _get_plugin().diffuse_cubemap_bwd(cubemap, dout)
+        # One trailing None follows the cubemap gradient, as before.
+        return cubemap_grad, None
+
+def diffuse_cubemap(cubemap, use_python=False):
+    '''Filter a cubemap through the plugin's diffuse_cubemap kernels.
+
+    Args:
+        cubemap: Cubemap tensor, handed to the plugin unchanged.
+        use_python: Not supported; this op has no PyTorch reference path.
+
+    Returns:
+        The filtered cubemap as produced by the plugin.
+
+    Raises:
+        NotImplementedError: If use_python is True.
+    '''
+    if use_python:
+        # Raised explicitly: a bare `assert False` is stripped under `python -O`, which would
+        # fall through to an unbound `out` below.
+        raise NotImplementedError("diffuse_cubemap has no PyTorch reference implementation")
+
+    out = _diffuse_cubemap_func.apply(cubemap)
+    # Only checked while autograd anomaly detection is enabled.
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of diffuse_cubemap contains inf or NaN"
+    return out
+
+class _specular_cubemap(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's specular_cubemap_fwd / specular_cubemap_bwd.'''
+
+    @staticmethod
+    def forward(ctx, cubemap, roughness, costheta_cutoff, bounds):
+        # Note the plugin takes `bounds` second, unlike this method's argument order.
+        out = _get_plugin().specular_cubemap_fwd(cubemap, bounds, roughness, costheta_cutoff)
+        ctx.save_for_backward(cubemap, bounds)
+        # Non-tensor settings are kept as plain ctx attributes for the backward call.
+        ctx.roughness, ctx.theta_cutoff = roughness, costheta_cutoff
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        cubemap, bounds = ctx.saved_tensors
+        cubemap_grad = _get_plugin().specular_cubemap_bwd(cubemap, bounds, dout, ctx.roughness, ctx.theta_cutoff)
+        # Only the cubemap receives a gradient; the other three inputs get None.
+        return cubemap_grad, None, None, None
+
+# Compute the bounds of the GGX NDF lobe to retain "cutoff" percent of the energy
+def __ndfBounds(res, roughness, cutoff):
+    '''Find the cosine cutoff of the GGX NDF lobe and fetch the matching bounds table from the plugin.
+
+    Args:
+        res: Forwarded to the plugin's specular_bounds.
+        roughness: Roughness; the NDF is evaluated with alphaSqr = roughness**4.
+        cutoff: Fraction of the cumulative NDF sum to retain.
+
+    Returns:
+        Tuple (costheta_cutoff, bounds).
+    '''
+    num_samples = 1000000
+
+    # Cosines of angles sampled uniformly over [0, pi/2]
+    cos_samples = np.cos(np.linspace(0, np.pi/2.0, num_samples))
+
+    # GGX NDF evaluated at every sample
+    alpha_sqr = roughness**4
+    clipped = np.clip(cos_samples, 0.0, 1.0)
+    denom = (clipped * alpha_sqr - clipped) * clipped + 1.0
+    ndf = alpha_sqr / (denom * denom * np.pi)
+
+    # First sample at which the running sum reaches `cutoff` of the total
+    running = np.cumsum(ndf)
+    cut_idx = np.argmax(running >= running[..., -1] * cutoff)
+    cos_cutoff = cos_samples[cut_idx]
+
+    # Brute force compute lookup table with bounds
+    bounds = _get_plugin().specular_bounds(res, cos_cutoff)
+
+    return cos_cutoff, bounds
+# Cache of __ndfBounds results, keyed by its (res, roughness, cutoff) arguments
+__ndfBoundsDict = {}
+
+def specular_cubemap(cubemap, roughness, cutoff=0.99, use_python=False):
+    '''Filter a cubemap through the plugin's specular_cubemap kernels.
+
+    Args:
+        cubemap: Tensor whose first dimension is 6 and whose second and third dimensions are equal.
+        roughness: Roughness used for the NDF lobe; also part of the bounds cache key.
+        cutoff: Fraction of the NDF energy retained when computing the lobe bounds.
+        use_python: Not supported; this op has no PyTorch reference path.
+
+    Returns:
+        The first three channels of the plugin output divided by its remaining channel(s).
+
+    Raises:
+        NotImplementedError: If use_python is True.
+    '''
+    assert cubemap.shape[0] == 6 and cubemap.shape[1] == cubemap.shape[2], "Bad shape for cubemap tensor: %s" % str(cubemap.shape)
+
+    if use_python:
+        # Raised explicitly: a bare `assert False` is stripped under `python -O`, which would
+        # fall through to an unbound `out` below.
+        raise NotImplementedError("specular_cubemap has no PyTorch reference implementation")
+
+    # The bounds depend only on (resolution, roughness, cutoff), so they are computed once per key.
+    key = (cubemap.shape[1], roughness, cutoff)
+    if key not in __ndfBoundsDict:
+        __ndfBoundsDict[key] = __ndfBounds(*key)
+    out = _specular_cubemap.apply(cubemap, roughness, *__ndfBoundsDict[key])
+
+    # Only checked while autograd anomaly detection is enabled.
+    if torch.is_anomaly_enabled():
+        assert torch.all(torch.isfinite(out)), "Output of specular_cubemap contains inf or NaN"
+    return out[..., 0:3] / out[..., 3:]
+
+#----------------------------------------------------------------------------
+# Fast image loss function
+
+class _image_loss_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's image_loss_fwd / image_loss_bwd.'''
+
+    @staticmethod
+    def forward(ctx, img, target, loss, tonemapper):
+        # Non-tensor settings are kept as plain ctx attributes for the backward call.
+        ctx.loss, ctx.tonemapper = loss, tonemapper
+        ctx.save_for_backward(img, target)
+        # The trailing False is passed through unchanged; its meaning is defined by the plugin.
+        out = _get_plugin().image_loss_fwd(img, target, loss, tonemapper, False)
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        img, target = ctx.saved_tensors
+        # The plugin's gradient tuple is extended with three trailing Nones, as before.
+        return _get_plugin().image_loss_bwd(img, target, dout, ctx.loss, ctx.tonemapper) + (None, None, None)
+
+def image_loss(img, target, loss='l1', tonemapper='none', use_python=False):
+    '''HDR image loss with tonemapping and loss evaluated in one kernel for better perf.
+    Unless noted otherwise, inputs are [minibatch_size, height, width, 3] tensors or broadcastable to that.
+
+    Args:
+        img: Input image.
+        target: Reference image to compare against.
+        loss: Loss type, one of ['l1', 'mse', 'smape', 'relmse']
+        tonemapper: Tonemapping operation, one of ['none', 'log_srgb']
+        use_python: Route through the PyTorch reference implementation instead of the plugin.
+
+    Returns:
+        Image space loss (scalar value).
+    '''
+    if not use_python:
+        raw = _image_loss_func.apply(img, target, loss, tonemapper)
+        # Plugin output is summed and divided by the product of img's first three dimensions.
+        denom = img.shape[0]*img.shape[1]*img.shape[2]
+        out = torch.sum(raw) / denom
+    else:
+        out = image_loss_fn(img, target, loss, tonemapper)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of image_loss contains inf or NaN"
+    return out
+
+#----------------------------------------------------------------------------
+# Transform points function
+
+class _xfm_func(torch.autograd.Function):
+    '''Autograd wrapper that forwards to the plugin's xfm_fwd / xfm_bwd.'''
+
+    @staticmethod
+    def forward(ctx, points, matrix, isPoints):
+        ctx.save_for_backward(points, matrix)
+        # isPoints is a plain flag, so it is kept as a ctx attribute for the backward call.
+        ctx.isPoints = isPoints
+        # The trailing False is passed through unchanged; its meaning is defined by the plugin.
+        return _get_plugin().xfm_fwd(points, matrix, isPoints, False)
+
+    @staticmethod
+    def backward(ctx, dout):
+        # saved_tensors is the supported accessor; saved_variables is a deprecated alias.
+        points, matrix = ctx.saved_tensors
+        # Only `points` receives a gradient; no gradient is returned for `matrix` or `isPoints`.
+        return (_get_plugin().xfm_bwd(points, matrix, dout, ctx.isPoints),) + (None, None, None)
+
+def xfm_points(points, matrix, use_python=False):
+    '''Apply a 4x4 transform to a batch of 3D points.
+    Args:
+        points: 3D points, shape [minibatch_size, num_vertices, 3] or [1, num_vertices, 3]
+        matrix: 4x4 transform matrices, shape [minibatch_size, 4, 4]
+        use_python: Compute with torch.matmul instead of the plugin (for validation)
+    Returns:
+        Homogeneous 4D points with shape [minibatch_size, num_vertices, 4].
+    '''
+    if use_python:
+        # Append w=1 so the translation part of the matrix applies to points.
+        homogeneous = torch.nn.functional.pad(points, pad=(0,1), mode='constant', value=1.0)
+        out = torch.matmul(homogeneous, torch.transpose(matrix, 1, 2))
+    else:
+        out = _xfm_func.apply(points, matrix, True)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of xfm_points contains inf or NaN"
+    return out
+
+def xfm_vectors(vectors, matrix, use_python=False):
+    '''Apply a 4x4 transform to a batch of 3D direction vectors.
+    Args:
+        vectors: 3D vectors, shape [minibatch_size, num_vertices, 3] or [1, num_vertices, 3]
+        matrix: 4x4 transform matrices, shape [minibatch_size, 4, 4]
+        use_python: Compute with torch.matmul instead of the plugin (for validation)
+
+    Returns:
+        Transformed vectors. The PyTorch path keeps only the first three components,
+        giving shape [minibatch_size, num_vertices, 3].
+        NOTE(review): the plugin path is presumably the same shape - confirm against the kernel.
+    '''
+
+    if use_python:
+        # Append w=0 so the translation part of the matrix does not affect directions.
+        homogeneous = torch.nn.functional.pad(vectors, pad=(0,1), mode='constant', value=0.0)
+        transformed = torch.matmul(homogeneous, torch.transpose(matrix, 1, 2))
+        out = transformed[..., 0:3].contiguous()
+    else:
+        out = _xfm_func.apply(vectors, matrix, False)
+
+    # The finiteness check only runs while autograd anomaly detection is on.
+    if torch.is_anomaly_enabled():
+        all_finite = torch.all(torch.isfinite(out))
+        assert all_finite, "Output of xfm_vectors contains inf or NaN"
+    return out
+
+
+
--- a/render/renderutils/tests/test_bsdf.py
+++ b/render/renderutils/tests/test_bsdf.py
@@ -0,0 +1,296 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+import os
+import sys
+sys.path.insert(0, os.path.join(sys.path[0], '../..'))
+import renderutils as ru
+
+RES = 4
+DTYPE = torch.float32
+
+def relative_loss(name, ref, cuda):
+	'''Print `name` followed by the largest element-wise relative error of `cuda` against `ref`.'''
+	ref = ref.float()
+	cuda = cuda.float()
+	# Epsilon is added to |ref| rather than to ref before abs(), so the denominator
+	# can never reach zero (ref == -1e-7 previously produced a 0 denominator).
+	rel_err = torch.abs(ref - cuda) / (torch.abs(ref) + 1e-7)
+	print(name, torch.max(rel_err).item())
+
+def test_normal():
+	'''Compare ru.prepare_shading_normal (plugin path) with its PyTorch reference, outputs and gradients.'''
+	names = ("pos", "view_pos", "perturbed_nrm", "smooth_nrm", "smooth_tng", "geom_nrm")
+
+	# One random leaf per input for the plugin path, plus an independent copy for the reference path.
+	cuda_in, ref_in = [], []
+	for _ in names:
+		leaf = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
+		cuda_in.append(leaf)
+		ref_in.append(leaf.clone().detach().requires_grad_(True))
+	target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')
+
+	ref = ru.prepare_shading_normal(*ref_in, True, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru.prepare_shading_normal(*cuda_in, True)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    bent normal")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for name, ref_t, cuda_t in zip(names, ref_in, cuda_in):
+		relative_loss(name + ":", ref_t.grad, cuda_t.grad)
+
+def test_schlick():
+	'''Compare ru._fresnel_shlick (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(channels, **kwargs):
+		return torch.rand(1, RES, RES, channels, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	f0_cuda = _rand(3, requires_grad=True)
+	f0_ref = _leaf(f0_cuda)
+	f90_cuda = _rand(3, requires_grad=True)
+	f90_ref = _leaf(f90_cuda)
+	# Scaled values are re-wrapped as a fresh leaf so .grad is populated on them.
+	cosT_cuda = _leaf(_rand(1, requires_grad=True) * 2.0)
+	cosT_ref = _leaf(cosT_cuda)
+	target = _rand(3)
+
+	ref = ru._fresnel_shlick(f0_ref, f90_ref, cosT_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru._fresnel_shlick(f0_cuda, f90_cuda, cosT_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Fresnel shlick")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in (("f0:", f0_ref, f0_cuda), ("f90:", f90_ref, f90_cuda), ("cosT:", cosT_ref, cosT_cuda)):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_ndf_ggx():
+	'''Compare ru._ndf_ggx (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(**kwargs):
+		return torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	alphaSqr_cuda = _leaf(_rand(requires_grad=True))
+	alphaSqr_ref = _leaf(alphaSqr_cuda)
+	# Values are spread over [-1, 2) and re-wrapped as a fresh leaf so .grad is populated.
+	cosT_cuda = _leaf(_rand(requires_grad=True) * 3.0 - 1)
+	cosT_ref = _leaf(cosT_cuda)
+	target = _rand()
+
+	ref = ru._ndf_ggx(alphaSqr_ref, cosT_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru._ndf_ggx(alphaSqr_cuda, cosT_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Ndf GGX")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in (("alpha:", alphaSqr_ref, alphaSqr_cuda), ("cosT:", cosT_ref, cosT_cuda)):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_lambda_ggx():
+	'''Compare ru._lambda_ggx (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(**kwargs):
+		return torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	alphaSqr_cuda = _rand(requires_grad=True)
+	alphaSqr_ref = _leaf(alphaSqr_cuda)
+	# Values are spread over [-1, 2) and re-wrapped as a fresh leaf so .grad is populated.
+	cosT_cuda = _leaf(_rand(requires_grad=True) * 3.0 - 1)
+	cosT_ref = _leaf(cosT_cuda)
+	target = _rand()
+
+	ref = ru._lambda_ggx(alphaSqr_ref, cosT_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru._lambda_ggx(alphaSqr_cuda, cosT_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Lambda GGX")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in (("alpha:", alphaSqr_ref, alphaSqr_cuda), ("cosT:", cosT_ref, cosT_cuda)):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_masking_smith():
+	'''Compare ru._masking_smith (plugin path) with its PyTorch reference, outputs and gradients.'''
+	labels = ("alpha:", "cosThetaI:", "cosThetaO:")
+
+	# Inputs in call order: alphaSqr, cosThetaI, cosThetaO. Each gets an independent reference copy.
+	cuda_in, ref_in = [], []
+	for _ in labels:
+		leaf = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda', requires_grad=True)
+		cuda_in.append(leaf)
+		ref_in.append(leaf.clone().detach().requires_grad_(True))
+	target = torch.rand(1, RES, RES, 1, dtype=DTYPE, device='cuda')
+
+	ref = ru._masking_smith(*ref_in, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru._masking_smith(*cuda_in)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Smith masking term")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in zip(labels, ref_in, cuda_in):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_lambert():
+	'''Compare ru.lambert (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(channels, **kwargs):
+		return torch.rand(1, RES, RES, channels, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	normals_cuda = _rand(3, requires_grad=True)
+	normals_ref = _leaf(normals_cuda)
+	wi_cuda = _rand(3, requires_grad=True)
+	wi_ref = _leaf(wi_cuda)
+	target = _rand(1)
+
+	ref = ru.lambert(normals_ref, wi_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru.lambert(normals_cuda, wi_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Lambert")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in (("nrm:", normals_ref, normals_cuda), ("wi:", wi_ref, wi_cuda)):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_frostbite():
+	'''Compare ru.frostbite_diffuse (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(channels, **kwargs):
+		return torch.rand(1, RES, RES, channels, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	normals_cuda = _rand(3, requires_grad=True)
+	normals_ref = _leaf(normals_cuda)
+	wi_cuda = _rand(3, requires_grad=True)
+	wi_ref = _leaf(wi_cuda)
+	wo_cuda = _rand(3, requires_grad=True)
+	wo_ref = _leaf(wo_cuda)
+	rough_cuda = _rand(1, requires_grad=True)
+	rough_ref = _leaf(rough_cuda)
+	target = _rand(1)
+
+	ref = ru.frostbite_diffuse(normals_ref, wi_ref, wo_ref, rough_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru.frostbite_diffuse(normals_cuda, wi_cuda, wo_cuda, rough_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Frostbite")
+	print("-------------------------------------------------------------")
+	relative_loss("res:", ref, cuda)
+	# Report order is nrm, wo, wi, rough.
+	report = (("nrm:", normals_ref, normals_cuda), ("wo:", wo_ref, wo_cuda), ("wi:", wi_ref, wi_cuda), ("rough:", rough_ref, rough_cuda))
+	for label, ref_t, cuda_t in report:
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_pbr_specular():
+	'''Compare ru.pbr_specular (plugin path) with its PyTorch reference, outputs and gradients.'''
+	def _rand(channels, **kwargs):
+		return torch.rand(1, RES, RES, channels, dtype=DTYPE, device='cuda', **kwargs)
+
+	def _leaf(t):
+		return t.clone().detach().requires_grad_(True)
+
+	col_cuda = _rand(3, requires_grad=True)
+	col_ref = _leaf(col_cuda)
+	nrm_cuda = _rand(3, requires_grad=True)
+	nrm_ref = _leaf(nrm_cuda)
+	wi_cuda = _rand(3, requires_grad=True)
+	wi_ref = _leaf(wi_cuda)
+	wo_cuda = _rand(3, requires_grad=True)
+	wo_ref = _leaf(wo_cuda)
+	alpha_cuda = _rand(1, requires_grad=True)
+	alpha_ref = _leaf(alpha_cuda)
+	target = _rand(3)
+
+	ref = ru.pbr_specular(col_ref, nrm_ref, wo_ref, wi_ref, alpha_ref, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru.pbr_specular(col_cuda, nrm_cuda, wo_cuda, wi_cuda, alpha_cuda)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Pbr specular")
+	print("-------------------------------------------------------------")
+
+	relative_loss("res:", ref, cuda)
+	report = (("col:", col_ref, col_cuda), ("nrm:", nrm_ref, nrm_cuda), ("wi:", wi_ref, wi_cuda), ("wo:", wo_ref, wo_cuda), ("alpha:", alpha_ref, alpha_cuda))
+	for label, ref_t, cuda_t in report:
+		# Inputs for which the reference path produced no gradient are skipped.
+		if ref_t.grad is not None:
+			relative_loss(label, ref_t.grad, cuda_t.grad)
+
+def test_pbr_bsdf(bsdf):
+	'''Compare ru.pbr_bsdf (plugin path) with its PyTorch reference for the given `bsdf` selector.'''
+	labels = ("kd:", "arm:", "pos:", "nrm:", "view:", "light:")
+
+	# Inputs in call order: kd, arm, pos, nrm, view_pos, light_pos. Each gets an independent reference copy.
+	cuda_in, ref_in = [], []
+	for _ in labels:
+		leaf = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
+		cuda_in.append(leaf)
+		ref_in.append(leaf.clone().detach().requires_grad_(True))
+	target = torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda')
+
+	ref = ru.pbr_bsdf(*ref_in, use_python=True, bsdf=bsdf)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	cuda = ru.pbr_bsdf(*cuda_in, bsdf=bsdf)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Pbr BSDF")
+	print("-------------------------------------------------------------")
+
+	relative_loss("res:", ref, cuda)
+	for label, ref_t, cuda_t in zip(labels, ref_in, cuda_in):
+		# Inputs for which the reference path produced no gradient are skipped.
+		if ref_t.grad is not None:
+			relative_loss(label, ref_t.grad, cuda_t.grad)
+
+# Run the comparisons only when executed as a script, so importing this module has no side effects.
+if __name__ == "__main__":
+	test_normal()
+
+	test_schlick()
+	test_ndf_ggx()
+	test_lambda_ggx()
+	test_masking_smith()
+
+	test_lambert()
+	test_frostbite()
+	test_pbr_specular()
+	test_pbr_bsdf('lambert')
+	test_pbr_bsdf('frostbite')
--- a/render/renderutils/tests/test_cubemap.py
+++ b/render/renderutils/tests/test_cubemap.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+import os
+import sys
+sys.path.insert(0, os.path.join(sys.path[0], '../..'))
+import renderutils as ru
+
+RES = 4
+DTYPE = torch.float32
+
+def relative_loss(name, ref, cuda):
+	'''Print `name` followed by the largest element-wise relative error of `cuda` against `ref`.'''
+	ref = ref.float()
+	cuda = cuda.float()
+	# Epsilon is added to |ref| rather than to ref before abs(), so the denominator
+	# can never reach zero (ref == -1e-7 previously produced a 0 denominator).
+	rel_err = torch.abs(ref - cuda) / (torch.abs(ref) + 1e-7)
+	print(name, torch.max(rel_err).item())
+
+def test_cubemap():
+	'''Compare a cubemap filter's plugin path against its PyTorch path, outputs and cubemap gradients.
+
+	NOTE(review): this calls ru.filter_cubemap, which is not among the names imported in
+	renderutils/__init__.py in this commit (only diffuse_cubemap / specular_cubemap are) -
+	this test presumably fails with AttributeError; confirm and update or remove.
+	'''
+	# Random cubemap leaf for the plugin path and an independent copy for the reference path.
+	cubemap_cuda = torch.rand(6, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
+	cubemap_ref = cubemap_cuda.clone().detach().requires_grad_(True)
+	weights = torch.rand(3, 3, 1, dtype=DTYPE, device='cuda')
+	target = torch.rand(6, RES, RES, 3, dtype=DTYPE, device='cuda')
+
+	# Reference path: forward, MSE against the random target, backward.
+	ref = ru.filter_cubemap(cubemap_ref, weights, use_python=True)
+	ref_loss = torch.nn.MSELoss()(ref, target)
+	ref_loss.backward()
+
+	# Plugin path: same steps on the other leaf.
+	cuda = ru.filter_cubemap(cubemap_cuda, weights, use_python=False)
+	cuda_loss = torch.nn.MSELoss()(cuda, target)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Cubemap:")
+	print("-------------------------------------------------------------")
+
+	# Report max relative error of the filtered output and of the cubemap gradient.
+	relative_loss("flt:", ref, cuda)
+	relative_loss("cubemap:", cubemap_ref.grad, cubemap_cuda.grad)
+
+
+# Run the comparison only when executed as a script, so importing this module has no side effects.
+if __name__ == "__main__":
+	test_cubemap()
--- a/render/renderutils/tests/test_loss.py
+++ b/render/renderutils/tests/test_loss.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+import os
+import sys
+sys.path.insert(0, os.path.join(sys.path[0], '../..'))
+import renderutils as ru
+
+RES = 8
+DTYPE = torch.float32
+
+def tonemap_srgb(f):
+    '''Apply the piecewise linear-to-sRGB transfer curve element-wise to `f`.'''
+    threshold = 0.0031308
+    # Power-law segment; the clamp keeps pow() away from values below the threshold.
+    gamma_branch = torch.pow(torch.clamp(f, min=threshold), 1.0/2.4)*1.055 - 0.055
+    # Linear segment used at or below the threshold.
+    linear_branch = 12.92*f
+    return torch.where(f > threshold, gamma_branch, linear_branch)
+
+def l1(output, target):
+    '''L1 loss between `output` and `target` after clamping to [0, 65535], log(x + 1) and sRGB tonemapping.'''
+    def _log_srgb(img):
+        clamped = torch.clamp(img, min=0, max=65535)
+        return tonemap_srgb(torch.log(clamped + 1))
+
+    return torch.nn.functional.l1_loss(_log_srgb(output), _log_srgb(target))
+
+def relative_loss(name, ref, cuda):
+	'''Print `name` followed by the largest element-wise relative error of `cuda` against `ref`.'''
+	ref = ref.float()
+	cuda = cuda.float()
+	# Epsilon is added to |ref| rather than to ref before abs(), so the denominator
+	# can never reach zero (ref == -1e-7 previously produced a 0 denominator).
+	rel_err = torch.abs(ref - cuda) / (torch.abs(ref) + 1e-7)
+	print(name, torch.max(rel_err).item())
+
+def test_loss(loss, tonemapper):
+	'''Compare ru.image_loss (plugin path) with its PyTorch reference for one loss/tonemapper pair.'''
+	def _rand_leaf():
+		return torch.rand(1, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
+
+	def _ref_copy(t):
+		return t.clone().detach().requires_grad_(True)
+
+	img_cuda = _rand_leaf()
+	img_ref = _ref_copy(img_cuda)
+	target_cuda = _rand_leaf()
+	target_ref = _ref_copy(target_cuda)
+
+	ref_loss = ru.image_loss(img_ref, target_ref, loss=loss, tonemapper=tonemapper, use_python=True)
+	ref_loss.backward()
+
+	cuda_loss = ru.image_loss(img_cuda, target_cuda, loss=loss, tonemapper=tonemapper)
+	cuda_loss.backward()
+
+	print("-------------------------------------------------------------")
+	print("    Loss: %s, %s" % (loss, tonemapper))
+	print("-------------------------------------------------------------")
+
+	relative_loss("res:", ref_loss, cuda_loss)
+	# Gradients are compared for both the image and the target.
+	for label, ref_t, cuda_t in (("img:", img_ref, img_cuda), ("target:", target_ref, target_cuda)):
+		relative_loss(label, ref_t.grad, cuda_t.grad)
+
+
+# Run the comparisons only when executed as a script, so importing this module has no side effects.
+if __name__ == "__main__":
+	test_loss('l1', 'none')
+	test_loss('l1', 'log_srgb')
+	test_loss('mse', 'log_srgb')
+	test_loss('smape', 'none')
+	test_loss('relmse', 'none')
+	test_loss('mse', 'none')
--- a/render/renderutils/tests/test_mesh.py
+++ b/render/renderutils/tests/test_mesh.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+import os
+import sys
+sys.path.insert(0, os.path.join(sys.path[0], '../..'))
+import renderutils as ru
+
+# Number of 4x4 transform matrices generated per test.
+BATCH = 8
+# Number of points/vectors in each random test tensor.
+RES = 1024
+# Floating-point precision of the random test tensors.
+DTYPE = torch.float32
+
+# Fix the RNG seed so the random test inputs are repeatable between runs.
+torch.manual_seed(0)
+
+def tonemap_srgb(f):
+    """Map `f` through the piecewise sRGB curve, element by element.
+
+    Above the 0.0031308 threshold the result is x ** (1 / 2.4) * 1.055 - 0.055;
+    at or below it the result is 12.92 * x.
+    """
+    threshold = 0.0031308
+    # Clamping first keeps pow() away from inputs below the threshold, whose
+    # results are discarded by torch.where anyway.
+    above = torch.pow(torch.clamp(f, min=threshold), 1.0/2.4)*1.055 - 0.055
+    below = 12.92*f
+    return torch.where(f > threshold, above, below)
+
+def l1(output, target):
+    """Return the L1 loss between two images after log + sRGB tonemapping.
+
+    Each input is clamped to [0, 65535], passed through log(x + 1) and
+    tonemap_srgb(), and the two results go to torch.nn.functional.l1_loss.
+    """
+    def _prepare(image):
+        clamped = torch.clamp(image, min=0, max=65535)
+        return tonemap_srgb(torch.log(clamped + 1))
+
+    return torch.nn.functional.l1_loss(_prepare(output), _prepare(target))
+
+def relative_loss(name, ref, cuda):
+	"""Print the largest element-wise relative error of `cuda` against `ref`.
+
+	Both tensors are converted to float32 first. The error is
+	|ref - cuda| / (|ref| + 1e-7), reduced with max over all elements.
+
+	Args:
+		name: Label printed in front of the error value.
+		ref: Reference tensor.
+		cuda: Tensor to compare against the reference.
+	"""
+	ref = ref.float()
+	cuda = cuda.float()
+	# Without the epsilon a zero reference element gives 0/0 = NaN or x/0 = inf,
+	# which then dominates the max and hides the real error.
+	rel_err = torch.abs(ref - cuda) / (torch.abs(ref) + 1e-7)
+	print(name, torch.max(rel_err).item())
+
+def test_xfm_points():
+	"""Compare ru.xfm_points with and without use_python=True.
+
+	Random points and matrices are created on the CUDA device and cloned so
+	both calls see identical values. Each output is fed through an MSE loss
+	against a shared random target, and the outputs and point gradients are
+	reported via relative_loss().
+	"""
+	rand_opts = dict(dtype=DTYPE, device='cuda')
+	mse = torch.nn.MSELoss()
+
+	points_cuda = torch.rand(1, RES, 3, requires_grad=True, **rand_opts)
+	points_ref = points_cuda.clone().detach().requires_grad_(True)
+	mtx_cuda = torch.rand(BATCH, 4, 4, requires_grad=False, **rand_opts)
+	mtx_ref = mtx_cuda.clone().detach().requires_grad_(True)
+	target = torch.rand(BATCH, RES, 4, requires_grad=True, **rand_opts)
+
+	# use_python=True call first, then the default call, each with its own backward.
+	ref_out = ru.xfm_points(points_ref, mtx_ref, use_python=True)
+	mse(ref_out, target).backward()
+
+	cuda_out = ru.xfm_points(points_cuda, mtx_cuda)
+	mse(cuda_out, target).backward()
+
+	print("-------------------------------------------------------------")
+
+	for label, expected, actual in (
+		("res:", ref_out, cuda_out),
+		("points:", points_ref.grad, points_cuda.grad),
+	):
+		relative_loss(label, expected, actual)
+
+def test_xfm_vectors():
+	"""Compare ru.xfm_vectors and ru.xfm_points with and without use_python=True.
+
+	The same random data is cloned into four independent leaf tensors so the
+	vector and point transforms each get their own gradients. The vector
+	outputs are compared against the first three channels of the random
+	target, the point outputs against all four.
+	"""
+	rand_opts = dict(dtype=DTYPE, device='cuda')
+	mse = torch.nn.MSELoss()
+
+	def _leaf_copy(tensor):
+		# Detached clone that tracks its own gradient.
+		return tensor.clone().detach().requires_grad_(True)
+
+	points_cuda = torch.rand(1, RES, 3, requires_grad=True, **rand_opts)
+	points_ref = _leaf_copy(points_cuda)
+	points_cuda_p = _leaf_copy(points_cuda)
+	points_ref_p = _leaf_copy(points_cuda)
+	mtx_cuda = torch.rand(BATCH, 4, 4, requires_grad=False, **rand_opts)
+	mtx_ref = _leaf_copy(mtx_cuda)
+	target = torch.rand(BATCH, RES, 4, requires_grad=True, **rand_opts)
+
+	# Vector transform: outputs are compared against target[..., 0:3].
+	ref_out = ru.xfm_vectors(points_ref.contiguous(), mtx_ref, use_python=True)
+	mse(ref_out, target[..., 0:3]).backward()
+
+	cuda_out = ru.xfm_vectors(points_cuda.contiguous(), mtx_cuda)
+	mse(cuda_out, target[..., 0:3]).backward()
+
+	# Point transform on the separate *_p copies, compared against the full target.
+	ref_out_p = ru.xfm_points(points_ref_p.contiguous(), mtx_ref, use_python=True)
+	mse(ref_out_p, target).backward()
+
+	cuda_out_p = ru.xfm_points(points_cuda_p.contiguous(), mtx_cuda)
+	mse(cuda_out_p, target).backward()
+
+	print("-------------------------------------------------------------")
+
+	for label, expected, actual in (
+		("res:", ref_out, cuda_out),
+		("points:", points_ref.grad, points_cuda.grad),
+		("points_p:", points_ref_p.grad, points_cuda_p.grad),
+	):
+		relative_loss(label, expected, actual)
+
+# Run both transform comparisons when the script is executed.
+for _case in (test_xfm_points, test_xfm_vectors):
+	_case()
--- a/render/renderutils/tests/test_perf.py
+++ b/render/renderutils/tests/test_perf.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction, 
+# disclosure or distribution of this material and related documentation 
+# without an express license agreement from NVIDIA CORPORATION or 
+# its affiliates is strictly prohibited.
+
+import torch
+
+import os
+import sys
+sys.path.insert(0, os.path.join(sys.path[0], '../..'))
+import renderutils as ru
+
+# Floating-point precision of the random benchmark tensors.
+DTYPE=torch.float32
+
+def test_bsdf(BATCH, RES, ITR):
+	"""Time `ITR` forward calls of ru.pbr_bsdf with and without use_python=True.
+
+	Six random [BATCH, RES, RES, 3] input tensors are created on the CUDA
+	device, each with a detached copy for the use_python=True call. Both loops
+	are bracketed by CUDA events and the elapsed time reported by
+	torch.cuda.Event.elapsed_time is printed.
+
+	Args:
+		BATCH: Leading (batch) dimension of every input tensor.
+		RES: Size of the two spatial dimensions of every input tensor.
+		ITR: Number of forward calls timed per path.
+	"""
+	def _rand_pair():
+		# One random CUDA tensor plus a detached copy holding the same values.
+		fast = torch.rand(BATCH, RES, RES, 3, dtype=DTYPE, device='cuda', requires_grad=True)
+		return fast, fast.clone().detach().requires_grad_(True)
+
+	# Creation order matches the pbr_bsdf argument order: kd, arm, pos, nrm, view, light.
+	kd_cuda, kd_ref = _rand_pair()
+	arm_cuda, arm_ref = _rand_pair()
+	pos_cuda, pos_ref = _rand_pair()
+	nrm_cuda, nrm_ref = _rand_pair()
+	view_cuda, view_ref = _rand_pair()
+	light_cuda, light_ref = _rand_pair()
+
+	start = torch.cuda.Event(enable_timing=True)
+	end = torch.cuda.Event(enable_timing=True)
+
+	# One untimed call before measuring, so one-time setup cost stays out of the timings.
+	ru.pbr_bsdf(kd_cuda, arm_cuda, pos_cuda, nrm_cuda, view_cuda, light_cuda)
+
+	print("--- Testing: [%d, %d, %d] ---" % (BATCH, RES, RES))
+
+	start.record()
+	for _ in range(ITR):
+		_out = ru.pbr_bsdf(kd_ref, arm_ref, pos_ref, nrm_ref, view_ref, light_ref, use_python=True)
+	end.record()
+	# elapsed_time() is only valid once both events have completed.
+	torch.cuda.synchronize()
+	print("Pbr BSDF python:", start.elapsed_time(end))
+
+	start.record()
+	for _ in range(ITR):
+		_out = ru.pbr_bsdf(kd_cuda, arm_cuda, pos_cuda, nrm_cuda, view_cuda, light_cuda)
+	end.record()
+	torch.cuda.synchronize()
+	print("Pbr BSDF cuda:", start.elapsed_time(end))
+
+# Benchmark configurations as (BATCH, RES, ITR), run in this order.
+for _batch, _res, _itr in (
+	(1, 512, 1000),
+	(16, 512, 1000),
+	(1, 2048, 1000),
+):
+	test_bsdf(_batch, _res, _itr)