first commit
This commit is contained in:
710
render/renderutils/c_src/bsdf.cu
Normal file
710
render/renderutils/c_src/bsdf.cu
Normal file
@@ -0,0 +1,710 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
#include "bsdf.h"
|
||||
|
||||
#define SPECULAR_EPSILON 1e-4f
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Lambert functions
|
||||
|
||||
// Forward Lambert term: dot(nrm, wi) / pi, floored at zero.
// The vectors are used as given; no normalization happens here.
__device__ inline float fwdLambert(const vec3f nrm, const vec3f wi)
{
    // M_PI is normally a double constant; cast it so the division is not
    // silently promoted to double precision.
    return max(dot(nrm, wi) / static_cast<float>(M_PI), 0.0f);
}
|
||||
|
||||
// Backward pass of fwdLambert.
//   nrm, wi     : the forward inputs.
//   d_nrm, d_wi : gradient outputs, updated through bwdDot.
//   d_out       : upstream gradient of the forward result.
__device__ inline void bwdLambert(const vec3f nrm, const vec3f wi, vec3f& d_nrm, vec3f& d_wi, const float d_out)
{
    // The forward max() floors the result at zero whenever the dot product is
    // not positive, so no gradient flows in that case.
    if (dot(nrm, wi) > 0.0f)
    {
        // Cast M_PI so the scaling stays in single precision.
        bwdDot(nrm, wi, d_nrm, d_wi, d_out / static_cast<float>(M_PI));
    }
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Fresnel Schlick
|
||||
|
||||
// Scalar Schlick Fresnel: blends f0 and f90 with weight (1 - cosTheta)^5.
// cosTheta is clamped to [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON] first.
__device__ inline float fwdFresnelSchlick(const float f0, const float f90, const float cosTheta)
{
    const float clampedCos = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float weight = powf(1.0f - clampedCos, 5.0f);
    return f0 * (1.0f - weight) + f90 * weight;
}
|
||||
|
||||
// Backward pass of the scalar fwdFresnelSchlick.
//   f0, f90, cosTheta         : the forward inputs.
//   d_f0, d_f90, d_cosTheta   : gradient accumulators (added to, not overwritten).
//   d_out                     : upstream gradient of the forward result.
__device__ inline void bwdFresnelSchlick(const float f0, const float f90, const float cosTheta, float& d_f0, float& d_f90, float& d_cosTheta, const float d_out)
{
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float scale = powf(max(1.0f - _cosTheta, 0.0f), 5.0f);

    // out = f0 * (1 - scale) + f90 * scale. Float literals keep this in single precision.
    d_f0 += d_out * (1.0f - scale);
    d_f90 += d_out * scale;

    // cosTheta only receives a gradient where the forward clamp is inactive.
    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f);
    }
}
|
||||
|
||||
// vec3f Schlick Fresnel: blends f0 and f90 with the scalar weight (1 - cosTheta)^5.
// cosTheta is clamped to [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON] first.
__device__ inline vec3f fwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta)
{
    const float clampedCos = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float weight = powf(1.0f - clampedCos, 5.0f);
    return f0 * (1.0f - weight) + f90 * weight;
}
|
||||
|
||||
// Backward pass of the vec3f fwdFresnelSchlick.
//   f0, f90, cosTheta         : the forward inputs.
//   d_f0, d_f90, d_cosTheta   : gradient accumulators (added to, not overwritten).
//   d_out                     : upstream gradient of the forward result.
__device__ inline void bwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta, vec3f& d_f0, vec3f& d_f90, float& d_cosTheta, const vec3f d_out)
{
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float scale = powf(max(1.0f - _cosTheta, 0.0f), 5.0f);

    // out = f0 * (1 - scale) + f90 * scale. Float literals keep this in single precision.
    d_f0 += d_out * (1.0f - scale);
    d_f90 += d_out * scale;

    // cosTheta only receives a gradient where the forward clamp is inactive.
    // The per-channel contributions are reduced with sum() because cosTheta is scalar.
    if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += sum(d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f));
    }
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Frostbite diffuse
|
||||
|
||||
// Forward Frostbite-style diffuse term.
// Returns 0 unless both wi and wo have a strictly positive dot product with nrm;
// otherwise returns the product of two Schlick terms (evaluated at dot(wi, nrm)
// and dot(wo, nrm)) and a roughness-dependent energy factor.
__device__ inline float fwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness)
{
    const float cosI = dot(wi, nrm);
    const float cosO = dot(wo, nrm);

    // Guard: either direction on or below the surface plane contributes nothing.
    if (!(cosI > 0.0f && cosO > 0.0f))
        return 0.0f;

    const vec3f halfVec = safeNormalize(wo + wi);
    const float cosIH = dot(wi, halfVec);

    const float bias = 0.5f * linearRoughness;
    const float factor = 1.0f - (0.51f / 1.51f) * linearRoughness;
    const float f90 = bias + 2.f * cosIH * cosIH * linearRoughness;
    const float f0 = 1.f;

    const float scatterI = fwdFresnelSchlick(f0, f90, cosI);
    const float scatterO = fwdFresnelSchlick(f0, f90, cosO);

    return scatterI * scatterO * factor;
}
|
||||
|
||||
// Backward pass of fwdFrostbiteDiffuse.
//   nrm, wi, wo, linearRoughness : the forward inputs.
//   d_nrm, d_wi, d_wo            : gradient outputs, updated through bwdDot / operator+=.
//   d_linearRoughness            : gradient accumulator for linearRoughness.
//   d_out                        : upstream gradient of the forward result.
// The forward intermediates are recomputed here rather than stored.
__device__ inline void bwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness, vec3f& d_nrm, vec3f& d_wi, vec3f& d_wo, float &d_linearRoughness, const float d_out)
{
    float wiDotN = dot(wi, nrm);
    float woDotN = dot(wo, nrm);

    // Outside this region the forward pass returns a constant 0, so nothing propagates.
    if (wiDotN > 0.0f && woDotN > 0.0f)
    {
        // -------------- FWD (recompute) --------------
        vec3f h = safeNormalize(wo + wi);
        float wiDotH = dot(wi, h);

        float energyBias = 0.5f * linearRoughness;
        float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
        float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
        float f0 = 1.f;

        float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
        float woScatter = fwdFresnelSchlick(f0, f90, woDotN);

        // -------------- BWD --------------
        // Backprop: return wiScatter * woScatter * energyFactor;
        float d_wiScatter = d_out * woScatter * energyFactor;
        float d_woScatter = d_out * wiScatter * energyFactor;
        float d_energyFactor = d_out * wiScatter * woScatter;

        // Backprop: float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
        // d_f0 is required by the callee but discarded, since f0 is a constant.
        float d_woDotN = 0.0f, d_f0 = 0.0f, d_f90 = 0.0f;
        bwdFresnelSchlick(f0, f90, woDotN, d_f0, d_f90, d_woDotN, d_woScatter);

        // Backprop: float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
        float d_wiDotN = 0.0f;
        bwdFresnelSchlick(f0, f90, wiDotN, d_f0, d_f90, d_wiDotN, d_wiScatter);

        // Backprop: float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
        float d_energyBias = d_f90;
        float d_wiDotH = d_f90 * 4 * wiDotH * linearRoughness;
        d_linearRoughness += d_f90 * 2 * wiDotH * wiDotH;

        // Backprop: float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
        d_linearRoughness -= (0.51f / 1.51f) * d_energyFactor;

        // Backprop: float energyBias = 0.5f * linearRoughness;
        // Float literal: avoids promoting the accumulation to double.
        d_linearRoughness += 0.5f * d_energyBias;

        // Backprop: float wiDotH = dot(wi, h);
        vec3f d_h(0);
        bwdDot(wi, h, d_wi, d_h, d_wiDotH);

        // Backprop: vec3f h = safeNormalize(wo + wi);
        vec3f d_wo_wi(0);
        bwdSafeNormalize(wo + wi, d_wo_wi, d_h);
        d_wi += d_wo_wi; d_wo += d_wo_wi;

        // Backprop: the two dot products against the normal.
        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
    }
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Ndf GGX
|
||||
|
||||
// GGX normal distribution term: alphaSqr / (pi * d^2), where
// d = (alphaSqr - 1) * cosTheta^2 + 1 and cosTheta is clamped to
// [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON].
__device__ inline float fwdNdfGGX(const float alphaSqr, const float cosTheta)
{
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
    // Cast M_PI so the denominator is not promoted to double precision.
    return alphaSqr / (d * d * static_cast<float>(M_PI));
}
|
||||
|
||||
// Backward pass of fwdNdfGGX.
//   alphaSqr, cosTheta     : the forward inputs.
//   d_alphaSqr, d_cosTheta : gradient accumulators (added to, not overwritten).
//   d_out                  : upstream gradient of the forward result.
__device__ inline void bwdNdfGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
{
    // Torch only back propagates if clamp doesn't trigger
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float cosThetaSqr = _cosTheta * _cosTheta;

    // Both partial derivatives share the denominator pi * d^3, with
    // d = (alphaSqr - 1) * cos^2 + 1. It is computed once, in single precision.
    const float denom = static_cast<float>(M_PI) * powf((alphaSqr - 1.0f) * cosThetaSqr + 1.0f, 3.0f);

    d_alphaSqr += d_out * (1.0f - (alphaSqr + 1.0f) * cosThetaSqr) / denom;
    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
    {
        d_cosTheta += d_out * -(4.0f * (alphaSqr - 1.0f) * alphaSqr * cosTheta) / denom;
    }
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Lambda GGX
|
||||
|
||||
// GGX Lambda term: 0.5 * (sqrt(1 + alphaSqr * tan^2(theta)) - 1), with
// cosTheta clamped to [SPECULAR_EPSILON, 1 - SPECULAR_EPSILON].
__device__ inline float fwdLambdaGGX(const float alphaSqr, const float cosTheta)
{
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float cosThetaSqr = _cosTheta * _cosTheta;
    // tan^2 = (1 - cos^2) / cos^2. The float literal keeps the divide in single precision.
    const float tanThetaSqr = (1.0f - cosThetaSqr) / cosThetaSqr;
    return 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);
}
|
||||
|
||||
// Backward pass of fwdLambdaGGX.
//   alphaSqr, cosTheta     : the forward inputs.
//   d_alphaSqr, d_cosTheta : gradient accumulators (added to, not overwritten).
//   d_out                  : upstream gradient of the forward result.
__device__ inline void bwdLambdaGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
{
    const float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
    const float cosThetaSqr = _cosTheta * _cosTheta;
    const float tanThetaSqr = (1.0f - cosThetaSqr) / cosThetaSqr;

    // d/d(alphaSqr) of 0.5 * (sqrt(1 + alphaSqr * tan^2) - 1)
    d_alphaSqr += d_out * (0.25f * tanThetaSqr) / sqrtf(alphaSqr * tanThetaSqr + 1.0f);

    // cosTheta only receives a gradient where the forward clamp is inactive.
    if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
        d_cosTheta += d_out * -(0.5f * alphaSqr) / (powf(_cosTheta, 3.0f) * sqrtf(alphaSqr / cosThetaSqr - alphaSqr + 1.0f));
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Masking GGX
|
||||
|
||||
// Masking term 1 / (1 + Lambda(cosThetaI) + Lambda(cosThetaO)), with Lambda
// evaluated by fwdLambdaGGX.
__device__ inline float fwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO)
{
    const float lamI = fwdLambdaGGX(alphaSqr, cosThetaI);
    const float lamO = fwdLambdaGGX(alphaSqr, cosThetaO);
    const float denom = 1.0f + lamI + lamO;
    return 1.0f / denom;
}
|
||||
|
||||
// Backward pass of fwdMaskingSmithGGXCorrelated.
//   alphaSqr, cosThetaI, cosThetaO       : the forward inputs.
//   d_alphaSqr, d_cosThetaI, d_cosThetaO : gradient accumulators, updated through bwdLambdaGGX.
//   d_out                                : upstream gradient of the forward result.
__device__ inline void bwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO, float& d_alphaSqr, float& d_cosThetaI, float& d_cosThetaO, const float d_out)
{
    // Recompute the forward denominator 1 + Lambda_I + Lambda_O.
    const float lamI = fwdLambdaGGX(alphaSqr, cosThetaI);
    const float lamO = fwdLambdaGGX(alphaSqr, cosThetaO);
    const float denom = 1.0f + lamI + lamO;

    // d(1/denom) = -1/denom^2. The same gradient reaches both Lambda terms.
    const float d_lambda = -d_out / powf(denom, 2.0f);
    bwdLambdaGGX(alphaSqr, cosThetaI, d_alphaSqr, d_cosThetaI, d_lambda);
    bwdLambdaGGX(alphaSqr, cosThetaO, d_alphaSqr, d_cosThetaO, d_lambda);
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// GGX specular
|
||||
|
||||
// GGX specular lobe: F * D * G * 0.25 / dot(wo, nrm).
//   col           : passed to the Fresnel term as f0 (f90 is fixed at 1).
//   nrm, wo, wi   : normal and the two directions; the half vector is safeNormalize(wo + wi).
//   alpha         : clamped to [min_roughness^2, 1] and then squared.
// Returns zero unless both dot(wo, nrm) and dot(wi, nrm) exceed SPECULAR_EPSILON.
__device__ vec3f fwdPbrSpecular(const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness)
{
    const float woDotN = dot(wo, nrm);
    const float wiDotN = dot(wi, nrm);

    // Reject back-facing / grazing configurations before evaluating the BRDF,
    // so the division by woDotN below is never performed on a discarded value.
    const bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
    if (!frontfacing)
        return vec3f(0.0f);

    const float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
    const float alphaSqr = _alpha * _alpha;

    const vec3f h = safeNormalize(wo + wi);
    const float woDotH = dot(wo, h);
    const float nDotH = dot(nrm, h);

    const float D = fwdNdfGGX(alphaSqr, nDotH);
    const float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
    const vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);

    // 0.25f (not 0.25) matches the literal used by bwdPbrSpecular.
    return F * D * G * 0.25f / woDotN;
}
|
||||
|
||||
// Backward pass of fwdPbrSpecular.
//   col, nrm, wo, wi, alpha, min_roughness : the forward inputs.
//   d_col, d_nrm, d_wo, d_wi, d_alpha      : gradient accumulators.
//   d_out                                  : upstream gradient of the forward result.
// Nothing is propagated when the forward pass took its zero (non front-facing) branch.
__device__ void bwdPbrSpecular(
    const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness,
    vec3f& d_col, vec3f& d_nrm, vec3f& d_wo, vec3f& d_wi, float& d_alpha, const vec3f d_out)
{
    ///////////////////////////////////////////////////////////////////////
    // FWD eval

    const float alphaMin = min_roughness * min_roughness;
    float _alpha = clamp(alpha, alphaMin, 1.0f);
    float alphaSqr = _alpha * _alpha;

    vec3f h = safeNormalize(wo + wi);
    float woDotN = dot(wo, nrm);
    float wiDotN = dot(wi, nrm);
    float woDotH = dot(wo, h);
    float nDotH = dot(nrm, h);

    bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);

    if (frontfacing)
    {
        // The three BRDF factors are only needed on the differentiable branch.
        float D = fwdNdfGGX(alphaSqr, nDotH);
        float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
        vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);

        ///////////////////////////////////////////////////////////////////////
        // BWD eval

        // Backprop: w = F * D * G * 0.25 / woDotN
        vec3f d_F = d_out * D * G * 0.25f / woDotN;
        float d_D = sum(d_out * F * G * 0.25f / woDotN);
        float d_G = sum(d_out * F * D * 0.25f / woDotN);

        float d_woDotN = -sum(d_out * F * D * G * 0.25f / (woDotN * woDotN));

        // d_f90 is required by the callee but discarded, since f90 is the constant 1.
        vec3f d_f90(0);
        float d_woDotH(0), d_wiDotN(0), d_nDotH(0), d_alphaSqr(0);
        bwdFresnelSchlick(col, 1.0f, woDotH, d_col, d_f90, d_woDotH, d_F);
        bwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN, d_alphaSqr, d_woDotN, d_wiDotN, d_G);
        bwdNdfGGX(alphaSqr, nDotH, d_alphaSqr, d_nDotH, d_D);

        // Backprop: the four dot products.
        vec3f d_h(0);
        bwdDot(nrm, h, d_nrm, d_h, d_nDotH);
        bwdDot(wo, h, d_wo, d_h, d_woDotH);
        bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
        bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);

        // Backprop: h = safeNormalize(wo + wi)
        vec3f d_h_unnorm(0);
        bwdSafeNormalize(wo + wi, d_h_unnorm, d_h);
        d_wo += d_h_unnorm;
        d_wi += d_h_unnorm;

        // Backprop: alphaSqr = clamp(alpha, alphaMin, 1)^2.
        // The clamp is constant in alpha once either bound is active, so the
        // gradient is masked on the upper bound as well as the lower one.
        if (alpha > alphaMin && alpha <= 1.0f)
            d_alpha += d_alphaSqr * 2 * alpha;
    }
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Full PBR BSDF
|
||||
|
||||
// Full BSDF: diffuse + GGX specular.
//   kd                  : base colour used for both the diffuse and specular colours.
//   arm                 : arm.x scales the specular colour by (1 - arm.x); arm.y is the
//                         roughness (alpha = arm.y^2); arm.z blends between 0.04 and kd
//                         for the specular colour and scales the diffuse colour by (1 - arm.z).
//   pos, view_pos, light_pos : wo = normalize(view_pos - pos), wi = normalize(light_pos - pos).
//   BSDF                : 0 selects the Lambert diffuse term, anything else Frostbite diffuse.
__device__ vec3f fwdPbrBSDF(const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF)
{
    const vec3f wo = safeNormalize(view_pos - pos);
    const vec3f wi = safeNormalize(light_pos - pos);

    const float alpha = arm.y * arm.y;
    // Float literal 1.0f keeps the whole expression in single precision.
    const vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0f - arm.x);
    const vec3f diff_col = kd * (1.0f - arm.z);

    const float diff = (BSDF == 0) ? fwdLambert(nrm, wi)
                                   : fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);

    const vec3f diffuse = diff_col * diff;
    const vec3f specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness);

    return diffuse + specular;
}
|
||||
|
||||
// Backward pass of fwdPbrBSDF.
//   kd .. BSDF                        : the forward inputs.
//   d_kd, d_arm, d_pos, d_nrm,
//   d_view_pos, d_light_pos           : gradient accumulators.
//   d_out                             : upstream gradient of the forward result.
// The forward intermediates are recomputed here rather than stored.
__device__ void bwdPbrBSDF(
    const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF,
    vec3f& d_kd, vec3f& d_arm, vec3f& d_pos, vec3f& d_nrm, vec3f& d_view_pos, vec3f& d_light_pos, const vec3f d_out)
{
    ////////////////////////////////////////////////////////////////////////
    // FWD
    vec3f _wi = light_pos - pos;
    vec3f _wo = view_pos - pos;
    vec3f wi = safeNormalize(_wi);
    vec3f wo = safeNormalize(_wo);

    float alpha = arm.y * arm.y;
    // Float literal 1.0f keeps the whole expression in single precision.
    vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0f - arm.x);
    vec3f diff_col = kd * (1.0f - arm.z);
    float diff = 0.0f;
    if (BSDF == 0)
        diff = fwdLambert(nrm, wi);
    else
        diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);

    ////////////////////////////////////////////////////////////////////////
    // BWD

    // Backprop: specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness)
    float d_alpha(0);
    vec3f d_spec_col(0), d_wi(0), d_wo(0);
    bwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness, d_spec_col, d_nrm, d_wo, d_wi, d_alpha, d_out);

    // Backprop: diffuse = diff_col * diff (scalar diff side)
    float d_diff = sum(diff_col * d_out);
    if (BSDF == 0)
        bwdLambert(nrm, wi, d_nrm, d_wi, d_diff);
    else
        bwdFrostbiteDiffuse(nrm, wi, wo, arm.y, d_nrm, d_wi, d_wo, d_arm.y, d_diff);

    // Backprop: diff_col = kd * (1.0f - arm.z)
    vec3f d_diff_col = d_out * diff;
    d_kd += d_diff_col * (1.0f - arm.z);
    d_arm.z -= sum(d_diff_col * kd);

    // Backprop: spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x)
    d_kd -= d_spec_col * (arm.x - 1.0f) * arm.z;
    d_arm.x += sum(d_spec_col * (arm.z * (0.04f - kd) - 0.04f));
    d_arm.z -= sum(d_spec_col * (kd - 0.04f) * (arm.x - 1.0f));

    // Backprop: alpha = arm.y * arm.y
    d_arm.y += d_alpha * 2 * arm.y;

    // Backprop: vec3f wi = safeNormalize(light_pos - pos);
    vec3f d__wi(0);
    bwdSafeNormalize(_wi, d__wi, d_wi);
    d_light_pos += d__wi;
    d_pos -= d__wi;

    // Backprop: vec3f wo = safeNormalize(view_pos - pos);
    vec3f d__wo(0);
    bwdSafeNormalize(_wo, d__wo, d_wo);
    d_view_pos += d__wo;
    d_pos -= d__wo;
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Kernels
|
||||
|
||||
// Forward Lambert kernel. Each thread handles one (x, y, z) element: x and y
// come from the block/thread indices, z from blockIdx.z. Threads outside
// p.gridSize exit immediately.
__global__ void LambertFwdKernel(LambertKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f w = p.wi.fetch3(x, y, z);

    const float value = fwdLambert(n, w);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward Lambert kernel. Same thread-to-element mapping as the forward
// kernel; writes the gradients of nrm and wi with store_grad.
// NOTE(review): d_out is read from p.out via fetch1 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void LambertBwdKernel(LambertKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f w = p.wi.fetch3(x, y, z);
    const float upstream = p.out.fetch1(x, y, z);

    // Gradients start at zero and are filled in by bwdLambert.
    vec3f gradN(0), gradW(0);
    bwdLambert(n, w, gradN, gradW, upstream);

    p.nrm.store_grad(x, y, z, gradN);
    p.wi.store_grad(x, y, z, gradW);
}
|
||||
|
||||
// Forward Frostbite-diffuse kernel. Each thread handles one (x, y, z) element:
// x and y come from the block/thread indices, z from blockIdx.z. Threads
// outside p.gridSize exit immediately.
__global__ void FrostbiteDiffuseFwdKernel(FrostbiteDiffuseKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f inDir = p.wi.fetch3(x, y, z);
    const vec3f outDir = p.wo.fetch3(x, y, z);
    const float rough = p.linearRoughness.fetch1(x, y, z);

    const float value = fwdFrostbiteDiffuse(n, inDir, outDir, rough);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward Frostbite-diffuse kernel. Same thread-to-element mapping as the
// forward kernel; writes the gradients of nrm, wi, wo and linearRoughness.
// NOTE(review): d_out is read from p.out via fetch1 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void FrostbiteDiffuseBwdKernel(FrostbiteDiffuseKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f inDir = p.wi.fetch3(x, y, z);
    const vec3f outDir = p.wo.fetch3(x, y, z);
    const float rough = p.linearRoughness.fetch1(x, y, z);
    const float upstream = p.out.fetch1(x, y, z);

    // Gradients start at zero and are filled in by bwdFrostbiteDiffuse.
    float gradRough = 0.0f;
    vec3f gradN(0), gradIn(0), gradOut(0);
    bwdFrostbiteDiffuse(n, inDir, outDir, rough, gradN, gradIn, gradOut, gradRough, upstream);

    p.nrm.store_grad(x, y, z, gradN);
    p.wi.store_grad(x, y, z, gradIn);
    p.wo.store_grad(x, y, z, gradOut);
    p.linearRoughness.store_grad(x, y, z, gradRough);
}
|
||||
|
||||
// Forward Fresnel-Schlick kernel (vec3f variant). Each thread handles one
// (x, y, z) element: x and y come from the block/thread indices, z from
// blockIdx.z. Threads outside p.gridSize exit immediately.
__global__ void FresnelShlickFwdKernel(FresnelShlickKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f base = p.f0.fetch3(x, y, z);
    const vec3f grazing = p.f90.fetch3(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);

    const vec3f value = fwdFresnelSchlick(base, grazing, cosine);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward Fresnel-Schlick kernel (vec3f variant). Same thread-to-element
// mapping as the forward kernel; writes the gradients of f0, f90 and cosTheta.
// NOTE(review): d_out is read from p.out via fetch3 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void FresnelShlickBwdKernel(FresnelShlickKernelParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f base = p.f0.fetch3(x, y, z);
    const vec3f grazing = p.f90.fetch3(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);
    const vec3f upstream = p.out.fetch3(x, y, z);

    // Gradients start at zero and are filled in by bwdFresnelSchlick.
    vec3f gradBase(0), gradGrazing(0);
    float gradCosine(0);
    bwdFresnelSchlick(base, grazing, cosine, gradBase, gradGrazing, gradCosine, upstream);

    p.f0.store_grad(x, y, z, gradBase);
    p.f90.store_grad(x, y, z, gradGrazing);
    p.cosTheta.store_grad(x, y, z, gradCosine);
}
|
||||
|
||||
// Forward GGX NDF kernel. Each thread handles one (x, y, z) element: x and y
// come from the block/thread indices, z from blockIdx.z. Threads outside
// p.gridSize exit immediately.
__global__ void ndfGGXFwdKernel(NdfGGXParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);

    const float value = fwdNdfGGX(aSqr, cosine);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward GGX NDF kernel. Same thread-to-element mapping as the forward
// kernel; writes the gradients of alphaSqr and cosTheta.
// NOTE(review): d_out is read from p.out via fetch1 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void ndfGGXBwdKernel(NdfGGXParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);
    const float upstream = p.out.fetch1(x, y, z);

    // Gradients start at zero and are filled in by bwdNdfGGX.
    float gradASqr(0), gradCosine(0);
    bwdNdfGGX(aSqr, cosine, gradASqr, gradCosine, upstream);

    p.alphaSqr.store_grad(x, y, z, gradASqr);
    p.cosTheta.store_grad(x, y, z, gradCosine);
}
|
||||
|
||||
// Forward GGX Lambda kernel (reuses the NdfGGXParams layout). Each thread
// handles one (x, y, z) element: x and y come from the block/thread indices,
// z from blockIdx.z. Threads outside p.gridSize exit immediately.
__global__ void lambdaGGXFwdKernel(NdfGGXParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);

    const float value = fwdLambdaGGX(aSqr, cosine);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward GGX Lambda kernel (reuses the NdfGGXParams layout). Same
// thread-to-element mapping as the forward kernel; writes the gradients of
// alphaSqr and cosTheta.
// NOTE(review): d_out is read from p.out via fetch1 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void lambdaGGXBwdKernel(NdfGGXParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosine = p.cosTheta.fetch1(x, y, z);
    const float upstream = p.out.fetch1(x, y, z);

    // Gradients start at zero and are filled in by bwdLambdaGGX.
    float gradASqr(0), gradCosine(0);
    bwdLambdaGGX(aSqr, cosine, gradASqr, gradCosine, upstream);

    p.alphaSqr.store_grad(x, y, z, gradASqr);
    p.cosTheta.store_grad(x, y, z, gradCosine);
}
|
||||
|
||||
// Forward Smith-GGX masking kernel. Each thread handles one (x, y, z) element:
// x and y come from the block/thread indices, z from blockIdx.z. Threads
// outside p.gridSize exit immediately.
__global__ void maskingSmithFwdKernel(MaskingSmithParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosI = p.cosThetaI.fetch1(x, y, z);
    const float cosO = p.cosThetaO.fetch1(x, y, z);

    const float value = fwdMaskingSmithGGXCorrelated(aSqr, cosI, cosO);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward Smith-GGX masking kernel. Same thread-to-element mapping as the
// forward kernel; writes the gradients of alphaSqr, cosThetaI and cosThetaO.
// NOTE(review): d_out is read from p.out via fetch1 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void maskingSmithBwdKernel(MaskingSmithParams p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const float aSqr = p.alphaSqr.fetch1(x, y, z);
    const float cosI = p.cosThetaI.fetch1(x, y, z);
    const float cosO = p.cosThetaO.fetch1(x, y, z);
    const float upstream = p.out.fetch1(x, y, z);

    // Gradients start at zero and are filled in by bwdMaskingSmithGGXCorrelated.
    float gradASqr(0), gradCosI(0), gradCosO(0);
    bwdMaskingSmithGGXCorrelated(aSqr, cosI, cosO, gradASqr, gradCosI, gradCosO, upstream);

    p.alphaSqr.store_grad(x, y, z, gradASqr);
    p.cosThetaI.store_grad(x, y, z, gradCosI);
    p.cosThetaO.store_grad(x, y, z, gradCosO);
}
|
||||
|
||||
// Forward GGX specular kernel. Each thread handles one (x, y, z) element:
// x and y come from the block/thread indices, z from blockIdx.z. Threads
// outside p.gridSize exit immediately. p.min_roughness is shared by all threads.
__global__ void pbrSpecularFwdKernel(PbrSpecular p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f colour = p.col.fetch3(x, y, z);
    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f outDir = p.wo.fetch3(x, y, z);
    const vec3f inDir = p.wi.fetch3(x, y, z);
    const float a = p.alpha.fetch1(x, y, z);

    const vec3f value = fwdPbrSpecular(colour, n, outDir, inDir, a, p.min_roughness);
    p.out.store(x, y, z, value);
}
|
||||
|
||||
// Backward GGX specular kernel. Same thread-to-element mapping as the forward
// kernel; writes the gradients of col, nrm, wo, wi and alpha.
// NOTE(review): d_out is read from p.out via fetch3 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void pbrSpecularBwdKernel(PbrSpecular p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f colour = p.col.fetch3(x, y, z);
    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f outDir = p.wo.fetch3(x, y, z);
    const vec3f inDir = p.wi.fetch3(x, y, z);
    const float a = p.alpha.fetch1(x, y, z);
    const vec3f upstream = p.out.fetch3(x, y, z);

    // Gradients start at zero and are filled in by bwdPbrSpecular.
    float gradA(0);
    vec3f gradColour(0), gradN(0), gradOut(0), gradIn(0);
    bwdPbrSpecular(colour, n, outDir, inDir, a, p.min_roughness, gradColour, gradN, gradOut, gradIn, gradA, upstream);

    p.col.store_grad(x, y, z, gradColour);
    p.nrm.store_grad(x, y, z, gradN);
    p.wo.store_grad(x, y, z, gradOut);
    p.wi.store_grad(x, y, z, gradIn);
    p.alpha.store_grad(x, y, z, gradA);
}
|
||||
|
||||
// Forward full-BSDF kernel. Each thread handles one (x, y, z) element: x and y
// come from the block/thread indices, z from blockIdx.z. Threads outside
// p.gridSize exit immediately. p.min_roughness and p.BSDF are shared by all threads.
__global__ void pbrBSDFFwdKernel(PbrBSDF p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f albedo = p.kd.fetch3(x, y, z);
    const vec3f material = p.arm.fetch3(x, y, z);
    const vec3f position = p.pos.fetch3(x, y, z);
    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f eye = p.view_pos.fetch3(x, y, z);
    const vec3f light = p.light_pos.fetch3(x, y, z);

    const vec3f value = fwdPbrBSDF(albedo, material, position, n, eye, light, p.min_roughness, p.BSDF);
    p.out.store(x, y, z, value);
}
|
||||
// Backward full-BSDF kernel. Same thread-to-element mapping as the forward
// kernel; writes the gradients of kd, arm, pos, nrm, view_pos and light_pos.
// NOTE(review): d_out is read from p.out via fetch3 - presumably the launcher
// binds the upstream gradient there for the backward pass; confirm.
__global__ void pbrBSDFBwdKernel(PbrBSDF p)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int z = blockIdx.z;

    const bool inside = (x < p.gridSize.x) && (y < p.gridSize.y) && (z < p.gridSize.z);
    if (!inside)
        return;

    const vec3f albedo = p.kd.fetch3(x, y, z);
    const vec3f material = p.arm.fetch3(x, y, z);
    const vec3f position = p.pos.fetch3(x, y, z);
    const vec3f n = p.nrm.fetch3(x, y, z);
    const vec3f eye = p.view_pos.fetch3(x, y, z);
    const vec3f light = p.light_pos.fetch3(x, y, z);
    const vec3f upstream = p.out.fetch3(x, y, z);

    // Gradients start at zero and are filled in by bwdPbrBSDF.
    vec3f gradAlbedo(0), gradMaterial(0), gradPosition(0), gradN(0), gradEye(0), gradLight(0);
    bwdPbrBSDF(albedo, material, position, n, eye, light, p.min_roughness, p.BSDF,
               gradAlbedo, gradMaterial, gradPosition, gradN, gradEye, gradLight, upstream);

    p.kd.store_grad(x, y, z, gradAlbedo);
    p.arm.store_grad(x, y, z, gradMaterial);
    p.pos.store_grad(x, y, z, gradPosition);
    p.nrm.store_grad(x, y, z, gradN);
    p.view_pos.store_grad(x, y, z, gradEye);
    p.light_pos.store_grad(x, y, z, gradLight);
}
|
||||
84
render/renderutils/c_src/bsdf.h
Normal file
84
render/renderutils/c_src/bsdf.h
Normal file
@@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
struct LambertKernelParams
|
||||
{
|
||||
Tensor nrm;
|
||||
Tensor wi;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct FrostbiteDiffuseKernelParams
|
||||
{
|
||||
Tensor nrm;
|
||||
Tensor wi;
|
||||
Tensor wo;
|
||||
Tensor linearRoughness;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct FresnelShlickKernelParams
|
||||
{
|
||||
Tensor f0;
|
||||
Tensor f90;
|
||||
Tensor cosTheta;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct NdfGGXParams
|
||||
{
|
||||
Tensor alphaSqr;
|
||||
Tensor cosTheta;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct MaskingSmithParams
|
||||
{
|
||||
Tensor alphaSqr;
|
||||
Tensor cosThetaI;
|
||||
Tensor cosThetaO;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct PbrSpecular
|
||||
{
|
||||
Tensor col;
|
||||
Tensor nrm;
|
||||
Tensor wo;
|
||||
Tensor wi;
|
||||
Tensor alpha;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
float min_roughness;
|
||||
};
|
||||
|
||||
struct PbrBSDF
|
||||
{
|
||||
Tensor kd;
|
||||
Tensor arm;
|
||||
Tensor pos;
|
||||
Tensor nrm;
|
||||
Tensor view_pos;
|
||||
Tensor light_pos;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
float min_roughness;
|
||||
int BSDF;
|
||||
};
|
||||
74
render/renderutils/c_src/common.cpp
Normal file
74
render/renderutils/c_src/common.cpp
Normal file
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <algorithm>
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Block and grid size calculators for kernel launches.
|
||||
|
||||
// Chooses a 2D thread-block shape of at most maxWidth x maxHeight threads for
// a buffer of dims.x x dims.y elements. The z block size is always 1.
dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims)
{
    const int threadBudget = maxWidth * maxHeight;

    // Nothing to tile when only one thread is allowed or at most one element exists.
    const bool degenerate = threadBudget <= 1 || (dims.x * dims.y) <= 1;
    if (degenerate)
        return dim3(1, 1, 1);

    // Begin with the largest permitted block and shrink to fit odd buffer shapes.
    int blockW = maxWidth;
    int blockH = maxHeight;

    if (dims.x < blockW)
    {
        // Narrow buffer: keep halving the width while the halved width still covers dims.x.
        for (; (blockW >> 1) >= dims.x; blockW >>= 1)
        {
        }

        // Spend the freed threads on height, but never exceed the buffer height.
        blockH = threadBudget / blockW;
        if (blockH > dims.y)
            blockH = dims.y;
    }
    else if (dims.y < blockH)
    {
        // Short buffer: halve the height (doubling the width while it is still
        // below dims.x) until the block fits vertically. This branch is only
        // entered when blockH > dims.y, so the first pass always runs.
        do
        {
            blockH >>= 1;
            if (blockW < dims.x)
                blockW <<= 1;
        } while (blockH > dims.y);
    }

    return dim3(blockW, blockH, 1);
}
|
||||
|
||||
// returns the size of a block that can be reduced using horizontal SIMD operations (e.g. __shfl_xor_sync)
|
||||
dim3 getWarpSize(dim3 blockSize)
|
||||
{
|
||||
return dim3(
|
||||
std::min(blockSize.x, 32u),
|
||||
std::min(std::max(32u / blockSize.x, 1u), std::min(32u, blockSize.y)),
|
||||
std::min(std::max(32u / (blockSize.x * blockSize.y), 1u), std::min(32u, blockSize.z))
|
||||
);
|
||||
}
|
||||
|
||||
// Number of blocks per axis needed to cover `dims` with blocks of `blockSize`
// (ceiling division written as (n - 1) / b + 1).
// NOTE(review): the arithmetic is unsigned, so a zero extent in `dims` wraps
// around instead of yielding zero blocks -- callers presumably never pass 0.
dim3 getLaunchGridSize(dim3 blockSize, dim3 dims)
{
    return dim3(
        (dims.x - 1) / blockSize.x + 1,
        (dims.y - 1) / blockSize.y + 1,
        (dims.z - 1) / blockSize.z + 1);
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
41
render/renderutils/c_src/common.h
Normal file
41
render/renderutils/c_src/common.h
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cuda.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "vec3f.h"
|
||||
#include "vec4f.h"
|
||||
#include "tensor.h"
|
||||
|
||||
dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims);
|
||||
dim3 getLaunchGridSize(dim3 blockSize, dim3 dims);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define M_PI 3.14159265358979323846f
|
||||
#endif
|
||||
|
||||
// Host/device version of getWarpSize: the sub-block shape (at most 32 threads
// in total) that one warp covers inside a block of shape `blockSize`.
__host__ __device__ static inline dim3 getWarpSize(dim3 blockSize)
{
    // x: up to a full warp wide.
    const unsigned int warpX = min(blockSize.x, 32u);

    // y: lanes remaining after x, limited by the block height.
    const unsigned int lanesPerRow = max(32u / blockSize.x, 1u);
    const unsigned int warpY = min(lanesPerRow, min(32u, blockSize.y));

    // z: lanes remaining after x and y, limited by the block depth.
    const unsigned int lanesPerSlice = max(32u / (blockSize.x * blockSize.y), 1u);
    const unsigned int warpZ = min(lanesPerSlice, min(32u, blockSize.z));

    return dim3(warpX, warpY, warpZ);
}
|
||||
|
||||
// Clamps val to [mn, mx]: first raise to the lower bound, then cap at the upper bound.
__device__ static inline float clamp(float val, float mn, float mx)
{
    const float raised = max(val, mn);
    return min(raised, mx);
}
|
||||
#else
|
||||
dim3 getWarpSize(dim3 blockSize);
|
||||
#endif
|
||||
350
render/renderutils/c_src/cubemap.cu
Normal file
350
render/renderutils/c_src/cubemap.cu
Normal file
@@ -0,0 +1,350 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
#include "cubemap.h"
|
||||
#include <float.h>
|
||||
|
||||
// https://cgvr.cs.uni-bremen.de/teaching/cg_literatur/Spherical,%20Cubic,%20and%20Parabolic%20Environment%20Mappings.pdf
|
||||
// Weight of texel (x, y) on an N x N cube face, computed as the product of the
// angular widths (differences of arctangents) along x and y.
// Returns 1 when the face is a single texel (N <= 1).
// NOTE(review): texels below the face center are mirrored with abs(x - H), so
// they use the interval [|x-H|, |x-H|+1], one texel further out than their
// mirror image above the center -- confirm this asymmetry is intended.
__device__ float pixel_area(int x, int y, int N)
{
    if (!(N > 1))
        return 1;

    const int half = N / 2;

    // Distance (in texels) from the face center along each axis.
    const int ax = abs(x - half);
    const int ay = abs(y - half);

    const float angX = atan((float)(ax + 1) / (float)half) - atan((float)ax / (float)half);
    const float angY = atan((float)(ay + 1) / (float)half) - atan((float)ay / (float)half);
    return angX * angY;
}
|
||||
|
||||
// Maps texel (x, y) on cube face `side` (0..5) of an N x N cubemap to a
// normalized direction through the texel center.
// Returns the zero vector for any other `side` value.
__device__ vec3f cube_to_dir(int x, int y, int side, int N)
{
    // Texel center remapped from [0, N) to (-1, 1).
    const float u = 2.0f * (((float)x + 0.5f) / (float)N) - 1.0f;
    const float v = 2.0f * (((float)y + 0.5f) / (float)N) - 1.0f;

    if (side == 0) return safeNormalize(vec3f(1, -v, -u));   // +x face
    if (side == 1) return safeNormalize(vec3f(-1, -v, u));   // -x face
    if (side == 2) return safeNormalize(vec3f(u, 1, v));     // +y face
    if (side == 3) return safeNormalize(vec3f(u, -1, -v));   // -y face
    if (side == 4) return safeNormalize(vec3f(u, -v, 1));    // +z face
    if (side == 5) return safeNormalize(vec3f(-u, -v, -1));  // -z face

    return vec3f(0,0,0); // Not reached for valid face indices
}
|
||||
|
||||
// Re-expresses direction v in the local frame of cube face `side` (0..5), i.e.
// permutes/negates components so that the face lies at local z = 1.
// Returns the zero vector for any other `side` value.
__device__ vec3f dir_to_side(int side, vec3f v)
{
    if (side == 0) return vec3f(-v.z, -v.y,  v.x);
    if (side == 1) return vec3f( v.z, -v.y, -v.x);
    if (side == 2) return vec3f( v.x,  v.z,  v.y);
    if (side == 3) return vec3f( v.x, -v.z, -v.y);
    if (side == 4) return vec3f( v.x, -v.y,  v.z);
    if (side == 5) return vec3f(-v.x, -v.y, -v.z);

    return vec3f(0,0,0); // Not reached for valid face indices
}
|
||||
|
||||
// 1D extent of a cone of half-angle `theta` around the 2D direction (x, z),
// projected onto the line z = 1. Writes the lower bound to _min and the upper
// bound to _max. When an edge of the cone does not cross z = 1 (its z component
// is <= 1e-5) the bound is pushed to +/-FLT_MAX according to the sign of its x.
__device__ void extents_1d(float x, float z, float theta, float& _min, float& _max)
{
    const float len = sqrtf(x * x + z * z);
    const float tanTheta = tan(theta);

    // The two cone edges: the direction offset perpendicular to itself by +/- tan(theta) * len.
    const float rightX = x + z * tanTheta * len;
    const float rightZ = z - x * tanTheta * len;
    const float leftX  = x - z * tanTheta * len;
    const float leftZ  = z + x * tanTheta * len;

    _min = (leftZ <= 0.00001f) ? (leftX > 0.0f ? FLT_MAX : -FLT_MAX) : leftX / leftZ;
    _max = (rightZ <= 0.00001f) ? (rightX > 0.0f ? FLT_MAX : -FLT_MAX) : rightX / rightZ;
}
|
||||
|
||||
// Texel-space bounding box on face `side` of an N x N cubemap for a cone of
// half-angle `theta` around direction v.
//  - theta >= ~PI/4: the whole face [0, N-1] x [0, N-1] is returned.
//  - cone misses the face: all four outputs are set to -1.
//  - otherwise: the projected extents, clamped to [0, N-1].
__device__ void dir_extents(int side, int N, vec3f v, float theta, int &_xmin, int& _xmax, int& _ymin, int& _ymax)
{
    vec3f local = dir_to_side(side, v); // face-local frame, face at z = 1

    // Wide cones: no culling is attempted, cover the full face.
    if (!(theta < 0.785398f)) // PI/4
    {
        _xmin = 0.0f;
        _xmax = (float)(N-1);
        _ymin = 0.0f;
        _ymax = (float)(N-1);
        return;
    }

    float loX, hiX, loY, hiY;
    extents_1d(local.x, local.z, theta, loX, hiX);
    extents_1d(local.y, local.z, theta, loY, hiY);

    // The face spans [-1, 1] in both local axes.
    const bool missesFace = loX > 1.0f || hiX < -1.0f || loY > 1.0f || hiY < -1.0f;
    if (missesFace)
    {
        _xmin = -1; _xmax = -1; _ymin = -1; _ymax = -1; // Bad aabb
        return;
    }

    // Map [-1, 1] to texel coordinates and clamp to the valid index range.
    const float texelsPerUnit = 0.5f * (float)N;
    const float lastTexel = (float)(N - 1);
    _xmin = (int)min(max((loX + 1.0f) * texelsPerUnit, 0.0f), lastTexel);
    _xmax = (int)min(max((hiX + 1.0f) * texelsPerUnit, 0.0f), lastTexel);
    _ymin = (int)min(max((loY + 1.0f) * texelsPerUnit, 0.0f), lastTexel);
    _ymax = (int)min(max((hiY + 1.0f) * texelsPerUnit, 0.0f), lastTexel);
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Diffuse kernel
|
||||
// Forward diffuse filtering: one thread per output texel (px, py) on face pz
// (pz comes from blockIdx.z). Each thread sums every texel of the input cubemap
// weighted by the clamped cosine to the output direction times pixel_area / pi.
__global__ void DiffuseCubemapFwdKernel(DiffuseCubemapKernelParams p)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    const int res = p.cubemap.dims[1];
    vec3f normal = cube_to_dir(px, py, pz, res);

    vec3f accum(0);
    for (int face = 0; face < p.cubemap.dims[0]; ++face)
    {
        for (int ty = 0; ty < res; ++ty)
        {
            for (int tx = 0; tx < res; ++tx)
            {
                vec3f dir = cube_to_dir(tx, ty, face, res);
                const float cosTheta = min(max(dot(normal, dir), 0.0f), 0.999f);
                const float weight = cosTheta * pixel_area(tx, ty, res) / 3.141592f; // pi = area of positive hemisphere
                accum += p.cubemap.fetch3(tx, ty, face) * weight;
            }
        }
    }

    p.out.store(px, py, pz, accum);
}
|
||||
|
||||
// Backward diffuse filtering: one thread per output texel (px, py, pz). The
// value fetched from p.out at that texel is scattered into the cubemap's d_val
// buffer with the same weights as the forward kernel. atomicAdd is used because
// every thread writes to every cubemap texel.
__global__ void DiffuseCubemapBwdKernel(DiffuseCubemapKernelParams p)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    const int res = p.cubemap.dims[1];
    vec3f normal = cube_to_dir(px, py, pz, res);
    vec3f outGrad = p.out.fetch3(px, py, pz);

    for (int face = 0; face < p.cubemap.dims[0]; ++face)
    {
        for (int ty = 0; ty < res; ++ty)
        {
            for (int tx = 0; tx < res; ++tx)
            {
                vec3f dir = cube_to_dir(tx, ty, face, res);
                const float cosTheta = min(max(dot(normal, dir), 0.0f), 0.999f);
                const float weight = cosTheta * pixel_area(tx, ty, res) / 3.141592f; // pi = area of positive hemisphere

                // One atomic per color channel of the (face, ty, tx) texel.
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 0), outGrad.x * weight);
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 1), outGrad.y * weight);
                atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(face, ty, tx, 2), outGrad.z * weight);
            }
        }
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// GGX splitsum kernel
|
||||
|
||||
// GGX normal distribution term: alphaSqr / (pi * d^2) with
// d = (cosTheta^2 * (alphaSqr - 1)) + 1, cosTheta clamped to [0, 1].
//
// Pi is kept as an explicit float constant: M_PI is a double outside MSVC
// (common.h only redefines it as float under _MSC_VER), which would promote the
// division to double precision on the device and make results platform-dependent.
__device__ inline float ndfGGX(const float alphaSqr, const float cosTheta)
{
    const float kPi = 3.14159265358979323846f;

    float _cosTheta = clamp(cosTheta, 0.0f, 1.0f);
    float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
    return alphaSqr / (d * d * kPi);
}
|
||||
|
||||
// For every output texel (px, py) on face pz, computes per cube face the texel
// bounding box of all directions whose cosine to the texel's direction is at
// least p.costheta_cutoff. Four values per face are stored in channels
// [face*4 .. face*4+3] of p.out: min_x, max_x, min_y, max_y.
// An empty box is encoded as min = size - 1, max = 0.
__global__ void SpecularBoundsKernel(SpecularBoundsKernelParams p)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    const int TILE_SIZE = 16;
    const int res = p.gridSize.x;
    vec3f axis = cube_to_dir(px, py, pz, res);

    // Exhaustive search over the whole cubemap, accelerated by tile culling.
    for (int face = 0; face < p.gridSize.z; ++face)
    {
        // Start from an empty box.
        int loX = p.gridSize.x - 1, hiX = 0;
        int loY = p.gridSize.y - 1, hiY = 0;

        // Visit the face in TILE_SIZE x TILE_SIZE (16x16) tiles.
        for (int tileX = 0; tileX < (p.gridSize.x + TILE_SIZE - 1) / TILE_SIZE; tileX++)
        {
            for (int tileY = 0; tileY < (p.gridSize.y + TILE_SIZE - 1) / TILE_SIZE; tileY++)
            {
                // Tile range [x0, x1) x [y0, y1), clipped to the face.
                const int x0 = tileX * TILE_SIZE, y0 = tileY * TILE_SIZE;
                const int x1 = min((tileX + 1) * TILE_SIZE, p.gridSize.x), y1 = min((tileY + 1) * TILE_SIZE, p.gridSize.y);

                // Coarse interval test on the directions at the four tile corners.
                vec3f c00 = cube_to_dir(x0, y0, face, res), c10 = cube_to_dir(x1, y0, face, res);
                vec3f c01 = cube_to_dir(x0, y1, face, res), c11 = cube_to_dir(x1, y1, face, res);

                float minx = min(min(c00.x, c10.x), min(c01.x, c11.x)), maxx = max(max(c00.x, c10.x), max(c01.x, c11.x));
                float miny = min(min(c00.y, c10.y), min(c01.y, c11.y)), maxy = max(max(c00.y, c10.y), max(c01.y, c11.y));
                float minz = min(min(c00.z, c10.z), min(c01.z, c11.z)), maxz = max(max(c00.z, c10.z), max(c01.z, c11.z));

                // Upper estimate of the dot product over the corner intervals.
                float upperDot = max(minx * axis.x, maxx * axis.x) + max(miny * axis.y, maxy * axis.y) + max(minz * axis.z, maxz * axis.z);
                if (!(upperDot >= p.costheta_cutoff))
                    continue;

                // Tile may intersect the cone: test every texel in it.
                for (int y = y0; y < y1; ++y)
                {
                    for (int x = x0; x < x1; ++x)
                    {
                        vec3f dir = cube_to_dir(x, y, face, res);
                        if (dot(dir, axis) >= p.costheta_cutoff)
                        {
                            loX = min(loX, x);
                            hiX = max(hiX, x);
                            loY = min(loY, y);
                            hiY = max(hiY, y);
                        }
                    }
                }
            }
        }

        p.out.store(p.out._nhwcIndex(pz, py, px, face * 4 + 0), loX);
        p.out.store(p.out._nhwcIndex(pz, py, px, face * 4 + 1), hiX);
        p.out.store(p.out._nhwcIndex(pz, py, px, face * 4 + 2), loY);
        p.out.store(p.out._nhwcIndex(pz, py, px, face * 4 + 3), hiY);
    }
}
|
||||
|
||||
// Forward GGX-weighted filtering: one thread per output texel (px, py, pz).
// For each cube face the thread reads a texel bounding box from p.bounds
// (channels face*4 .. face*4+3: xmin, xmax, ymin, ymax) and accumulates all
// texels inside it whose cosine to the output direction reaches
// p.costheta_cutoff. Output channels 0..2 hold the weighted color sum, channel
// 3 the sum of weights.
__global__ void SpecularCubemapFwdKernel(SpecularCubemapKernelParams p)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;
    const int py = blockIdx.y * blockDim.y + threadIdx.y;
    const int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    const int res = p.cubemap.dims[1];
    vec3f VNR = cube_to_dir(px, py, pz, res);

    // alpha = roughness^2, and the NDF takes alpha^2.
    const float alpha = p.roughness * p.roughness;
    const float alphaSqr = alpha * alpha;

    float weightSum = 0.0f;
    vec3f colorSum(0);
    for (int face = 0; face < p.cubemap.dims[0]; ++face)
    {
        const int xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 0));
        const int xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 1));
        const int ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 2));
        const int ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, face * 4 + 3));

        // Empty box on this face.
        if (xmin > xmax)
            continue;

        for (int y = ymin; y <= ymax; ++y)
        {
            for (int x = xmin; x <= xmax; ++x)
            {
                vec3f L = cube_to_dir(x, y, face, res);
                const float cosLV = dot(L, VNR);
                if (!(cosLV >= p.costheta_cutoff))
                    continue;

                vec3f H = safeNormalize(L + VNR);

                const float wiDotN = max(cosLV, 0.0f);
                const float VNRDotH = max(dot(VNR, H), 0.0f);

                const float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, res) / 4.0f;
                colorSum += p.cubemap.fetch3(x, y, face) * w;
                weightSum += w;
            }
        }
    }

    p.out.store(p.out._nhwcIndex(pz, py, px, 0), colorSum.x);
    p.out.store(p.out._nhwcIndex(pz, py, px, 1), colorSum.y);
    p.out.store(p.out._nhwcIndex(pz, py, px, 2), colorSum.z);
    p.out.store(p.out._nhwcIndex(pz, py, px, 3), weightSum);
}
|
||||
|
||||
// Backward GGX-weighted filtering: one thread per output texel (px, py, pz).
// The value fetched from p.out at that texel is scattered into the cubemap's
// d_val buffer using the same bounds, cutoff test and weights as
// SpecularCubemapFwdKernel. atomicAdd is required because bounding boxes of
// different output texels overlap.
// (The unused local color accumulator of the original was removed.)
__global__ void SpecularCubemapBwdKernel(SpecularCubemapKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    int Npx = p.cubemap.dims[1];
    vec3f VNR = cube_to_dir(px, py, pz, Npx);

    vec3f grad = p.out.fetch3(px, py, pz);

    // alpha = roughness^2, and the NDF takes alpha^2.
    float alpha = p.roughness * p.roughness;
    float alphaSqr = alpha * alpha;

    for (int s = 0; s < p.cubemap.dims[0]; ++s)
    {
        // Per-face texel bounding box: xmin, xmax, ymin, ymax.
        int xmin, xmax, ymin, ymax;
        xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
        xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
        ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
        ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));

        if (xmin <= xmax)
        {
            for (int y = ymin; y <= ymax; ++y)
            {
                for (int x = xmin; x <= xmax; ++x)
                {
                    vec3f L = cube_to_dir(x, y, s, Npx);
                    if (dot(L, VNR) >= p.costheta_cutoff)
                    {
                        vec3f H = safeNormalize(L + VNR);

                        float wiDotN = max(dot(L, VNR), 0.0f);
                        float VNRDotH = max(dot(VNR, H), 0.0f);

                        float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;

                        // One atomic per color channel of the (s, y, x) texel.
                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
                        atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
                    }
                }
            }
        }
    }
}
|
||||
38
render/renderutils/c_src/cubemap.h
Normal file
38
render/renderutils/c_src/cubemap.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
struct DiffuseCubemapKernelParams
|
||||
{
|
||||
Tensor cubemap;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
|
||||
struct SpecularCubemapKernelParams
|
||||
{
|
||||
Tensor cubemap;
|
||||
Tensor bounds;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
float costheta_cutoff;
|
||||
float roughness;
|
||||
};
|
||||
|
||||
struct SpecularBoundsKernelParams
|
||||
{
|
||||
float costheta_cutoff;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
};
|
||||
210
render/renderutils/c_src/loss.cu
Normal file
210
render/renderutils/c_src/loss.cu
Normal file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <cuda.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "loss.h"
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Utils
|
||||
|
||||
// Derivative of |x|: -1 for negative x, 0 at exactly zero, +1 otherwise.
__device__ inline float bwdAbs(float x)
{
    if (x == 0.0f)
        return 0.0f;
    return (x < 0.0f) ? -1.0f : 1.0f;
}
|
||||
|
||||
// Sums `val` over all 32 lanes with an XOR butterfly (offsets 1, 2, 4, 8, 16);
// every lane receives the total. Uses the full mask, so all 32 lanes of the
// warp must call this together.
__device__ float warpSum(float val)
{
    for (int offset = 1; offset < 32; offset <<= 1)
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    return val;
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Tonemapping
|
||||
|
||||
// Linear -> sRGB transfer function: a power curve above the 0.0031308
// threshold, a linear segment (with negatives clamped to zero) below it.
__device__ inline float fwdSRGB(float x)
{
    if (x > 0.0031308f)
        return powf(max(x, 0.0031308f), 1.0f / 2.4f) * 1.055f - 0.055f;

    return 12.92f * max(x, 0.0f);
}
|
||||
|
||||
// Gradient of fwdSRGB, accumulated into d_x.
//   x > 0.0031308 : derivative of the power segment
//   0 < x <= ...  : slope of the linear segment (12.92)
//   x <= 0        : no contribution
__device__ inline void bwdSRGB(float x, float &d_x, float d_out)
{
    if (x > 0.0031308f)
    {
        d_x += d_out * 0.439583f / powf(x, 0.583333f);
        return;
    }

    if (x > 0.0f)
        d_x += d_out * 12.92f;
}
|
||||
|
||||
// Per-channel tonemap: sRGB(log(x + 1)).
__device__ inline vec3f fwdTonemapLogSRGB(vec3f x)
{
    const float r = fwdSRGB(logf(x.x + 1.0f));
    const float g = fwdSRGB(logf(x.y + 1.0f));
    const float b = fwdSRGB(logf(x.z + 1.0f));
    return vec3f(r, g, b);
}
|
||||
|
||||
// Single-channel gradient of sRGB(log(x + 1)), accumulated into d_x.
// Contributes nothing when x is outside the open interval (0, 65535).
__device__ inline void bwdTonemapLogSRGBChannel(float x, float& d_x, float d_out)
{
    if (x > 0.0f && x < 65535.0f)
    {
        // Gradient w.r.t. log(x + 1), collected separately so that the chain
        // rule factor below only scales this call's contribution and not any
        // gradient already accumulated in d_x.
        float d_log = 0.0f;
        bwdSRGB(logf(x + 1.0f), d_log, d_out);
        d_x += d_log * (1 / (x + 1.0f)); // d/dx log(x + 1) = 1 / (x + 1)
    }
}

// Gradient of fwdTonemapLogSRGB, accumulated per channel into d_x.
// Channels whose input is outside (0, 65535) receive no gradient.
// For a zero-initialized d_x the result matches the previous implementation.
__device__ inline void bwdTonemapLogSRGB(vec3f x, vec3f& d_x, vec3f d_out)
{
    bwdTonemapLogSRGBChannel(x.x, d_x.x, d_out.x);
    bwdTonemapLogSRGBChannel(x.y, d_x.y, d_out.y);
    bwdTonemapLogSRGBChannel(x.z, d_x.z, d_out.z);
}
|
||||
|
||||
// Relative MSE: (img - target)^2 / (img^2 + target^2 + eps).
__device__ inline float fwdRELMSE(float img, float target, float eps = 0.1f)
{
    const float diff = img - target;
    const float denom = img * img + target * target + eps;
    return diff * diff / denom;
}
|
||||
|
||||
// Gradient of fwdRELMSE w.r.t. img and target, accumulated into d_img and
// d_target (d_target receives the contribution with a minus sign).
__device__ inline void bwdRELMSE(float img, float target, float &d_img, float &d_target, float d_out, float eps = 0.1f)
{
    const float diff = img - target;
    const float denom = (target * target + img * img + eps);
    const float denomSqr = (denom * denom);

    d_img    += d_out * 2 * diff * (target * (target + img) + eps) / denomSqr;
    d_target -= d_out * 2 * diff * (img * (target + img) + eps) / denomSqr;
}
|
||||
|
||||
// SMAPE-style loss: |img - target| / (img + target + eps).
__device__ inline float fwdSMAPE(float img, float target, float eps=0.01f)
{
    const float numer = abs(img - target);
    const float denom = (img + target + eps);
    return numer / denom;
}
|
||||
|
||||
// Gradient of fwdSMAPE w.r.t. img and target, accumulated into d_img and
// d_target (d_target receives the contribution with a minus sign).
__device__ inline void bwdSMAPE(float img, float target, float& d_img, float& d_target, float d_out, float eps = 0.01f)
{
    const float denom = (target + img + eps);
    const float denomSqr = (denom * denom);
    const float sign = bwdAbs(img - target);

    d_img    += d_out * sign * (2 * target + eps) / denomSqr;
    d_target -= d_out * sign * (2 * img + eps) / denomSqr;
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Kernels
|
||||
|
||||
// Forward image loss: one thread per pixel (px, py) of image pz (blockIdx.z).
// Each in-range thread clamps img/target to [0, 65535], optionally tonemaps,
// evaluates the selected per-channel loss and averages the three channels.
// The per-pixel losses are then summed with warpSum and the thread at the
// origin of each getWarpSize(blockDim) sub-block stores the partial sum.
// Out-of-range threads do not return early: they contribute 0 so that every
// lane takes part in the full-mask shuffle inside warpSum.
__global__ void imgLossFwdKernel(LossKernelParams p)
{
    const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int pz = blockIdx.z;
    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;

    float pixelLoss = 0.0f;
    if (inside)
    {
        vec3f img = p.img.fetch3(px, py, pz);
        vec3f target = p.target.fetch3(px, py, pz);

        img = vec3f(clamp(img.x, 0.0f, 65535.0f), clamp(img.y, 0.0f, 65535.0f), clamp(img.z, 0.0f, 65535.0f));
        target = vec3f(clamp(target.x, 0.0f, 65535.0f), clamp(target.y, 0.0f, 65535.0f), clamp(target.z, 0.0f, 65535.0f));

        if (p.tonemapper == TONEMAPPER_LOG_SRGB)
        {
            img = fwdTonemapLogSRGB(img);
            target = fwdTonemapLogSRGB(target);
        }

        vec3f channelLoss(0);
        switch (p.loss)
        {
        case LOSS_MSE:
            channelLoss = (img - target) * (img - target);
            break;
        case LOSS_RELMSE:
            channelLoss = vec3f(fwdRELMSE(img.x, target.x), fwdRELMSE(img.y, target.y), fwdRELMSE(img.z, target.z));
            break;
        case LOSS_SMAPE:
            channelLoss = vec3f(fwdSMAPE(img.x, target.x), fwdSMAPE(img.y, target.y), fwdSMAPE(img.z, target.z));
            break;
        default: // L1
            channelLoss = vec3f(abs(img.x - target.x), abs(img.y - target.y), abs(img.z - target.z));
            break;
        }

        pixelLoss = sum(channelLoss) / 3.0f;
    }

    pixelLoss = warpSum(pixelLoss);

    // Only the thread at the origin of each warp-shaped sub-block writes the partial sum.
    dim3 warpDims = getWarpSize(blockDim);
    if (inside && threadIdx.x % warpDims.x == 0 && threadIdx.y % warpDims.y == 0 && threadIdx.z % warpDims.z == 0)
        p.out.store(px / warpDims.x, py / warpDims.y, pz / warpDims.z, pixelLoss);
}
|
||||
|
||||
// Backward image loss: one thread per pixel (px, py) of image pz (blockIdx.z).
// Reads the gradient of the partial sum this pixel contributed to (indexed by
// the pixel position divided by getWarpSize(blockDim), mirroring the forward
// store), recomputes the forward values and writes gradients for img and target.
//
// Fixes relative to the previous version:
//  * The recomputed forward now applies the same [0, 65535] clamp as
//    imgLossFwdKernel, so the gradient of one operand is evaluated against the
//    value of the other operand that the forward pass actually used.
//  * The z component of the MSE gradient uses d_vloss.z (it used d_vloss.x;
//    numerically identical today because all components of d_vloss are equal).
__global__ void imgLossBwdKernel(LossKernelParams p)
{
    // Calculate pixel position.
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int pz = blockIdx.z;

    if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
        return;

    dim3 warpSize = getWarpSize(blockDim);

    vec3f _img = p.img.fetch3(px, py, pz);
    vec3f _target = p.target.fetch3(px, py, pz);
    float d_out = p.out.fetch1(px / warpSize.x, py / warpSize.y, pz / warpSize.z);

    /////////////////////////////////////////////////////////////////////
    // FWD (must mirror imgLossFwdKernel, including the clamp)

    vec3f img = vec3f(clamp(_img.x, 0.0f, 65535.0f), clamp(_img.y, 0.0f, 65535.0f), clamp(_img.z, 0.0f, 65535.0f));
    vec3f target = vec3f(clamp(_target.x, 0.0f, 65535.0f), clamp(_target.y, 0.0f, 65535.0f), clamp(_target.z, 0.0f, 65535.0f));
    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
    {
        img = fwdTonemapLogSRGB(img);
        target = fwdTonemapLogSRGB(target);
    }

    /////////////////////////////////////////////////////////////////////
    // BWD

    // The forward averages the three channels, hence the division by 3.
    vec3f d_vloss = vec3f(d_out, d_out, d_out) / 3.0f;

    vec3f d_img(0), d_target(0);
    if (p.loss == LOSS_MSE)
    {
        d_img = vec3f(d_vloss.x * 2 * (img.x - target.x), d_vloss.y * 2 * (img.y - target.y), d_vloss.z * 2 * (img.z - target.z));
        d_target = -d_img;
    }
    else if (p.loss == LOSS_RELMSE)
    {
        bwdRELMSE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
        bwdRELMSE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
        bwdRELMSE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
    }
    else if (p.loss == LOSS_SMAPE)
    {
        bwdSMAPE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
        bwdSMAPE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
        bwdSMAPE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
    }
    else
    {
        // L1
        d_img = d_vloss * vec3f(bwdAbs(img.x - target.x), bwdAbs(img.y - target.y), bwdAbs(img.z - target.z));
        d_target = -d_img;
    }

    // Chain through the tonemapper using the raw (pre-clamp) inputs; the
    // tonemap backward itself yields no gradient outside (0, 65535).
    if (p.tonemapper == TONEMAPPER_LOG_SRGB)
    {
        vec3f d__img(0), d__target(0);
        bwdTonemapLogSRGB(_img, d__img, d_img);
        bwdTonemapLogSRGB(_target, d__target, d_target);
        d_img = d__img; d_target = d__target;
    }

    // Gradient of the clamp: zero wherever the raw input was at or beyond a bound.
    if (_img.x <= 0.0f || _img.x >= 65535.0f) d_img.x = 0;
    if (_img.y <= 0.0f || _img.y >= 65535.0f) d_img.y = 0;
    if (_img.z <= 0.0f || _img.z >= 65535.0f) d_img.z = 0;
    if (_target.x <= 0.0f || _target.x >= 65535.0f) d_target.x = 0;
    if (_target.y <= 0.0f || _target.y >= 65535.0f) d_target.y = 0;
    if (_target.z <= 0.0f || _target.z >= 65535.0f) d_target.z = 0;

    p.img.store_grad(px, py, pz, d_img);
    p.target.store_grad(px, py, pz, d_target);
}
|
||||
38
render/renderutils/c_src/loss.h
Normal file
38
render/renderutils/c_src/loss.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Tonemapper applied to img and target before the loss is evaluated.
enum TonemapperType
{
    TONEMAPPER_NONE     = 0,  // values are used as-is (after clamping)
    TONEMAPPER_LOG_SRGB = 1   // sRGB(log(x + 1)) per channel
};
|
||||
|
||||
// Per-channel loss function selected in the image loss kernels.
enum LossType
{
    LOSS_L1     = 0,  // |img - target| (also the fallback for unlisted values)
    LOSS_MSE    = 1,  // (img - target)^2
    LOSS_RELMSE = 2,  // (img - target)^2 / (img^2 + target^2 + eps)
    LOSS_SMAPE  = 3   // |img - target| / (img + target + eps)
};
|
||||
|
||||
struct LossKernelParams
|
||||
{
|
||||
Tensor img;
|
||||
Tensor target;
|
||||
Tensor out;
|
||||
dim3 gridSize;
|
||||
TonemapperType tonemapper;
|
||||
LossType loss;
|
||||
};
|
||||
94
render/renderutils/c_src/mesh.cu
Normal file
94
render/renderutils/c_src/mesh.cu
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include <cuda.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "mesh.h"
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Kernels
|
||||
|
||||
// Transforms 3-component inputs by a 4x4 matrix: one thread per point px of
// batch entry pz.
//  - isPoints: out = M * (pos, 1), four components written.
//  - otherwise: out = upper-left 3x3 of M applied to pos, three components written.
// The matrix is staged in shared memory by the threads with threadIdx.x < 16.
// NOTE(review): this relies on blockDim.x >= 16 and on all threads of a block
// sharing the same pz (presumably blockDim.z == 1) -- confirm at the launch site.
__global__ void xfmPointsFwdKernel(XfmKernelParams p)
{
    const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int pz = blockIdx.z * blockDim.z + threadIdx.z;

    // Stored transposed: mtx[col][row] holds matrix element (row, col).
    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
    {
        const unsigned int row = threadIdx.x / 4;
        const unsigned int col = threadIdx.x % 4;
        mtx[col][row] = p.matrix.fetch(p.matrix.nhwcIndex(pz, row, col, 0));
    }
    __syncthreads();

    // Bounds check only after the barrier so that every thread reaches it.
    if (px >= p.gridSize.x)
        return;

    vec3f pos(
        p.points.fetch(p.points.nhwcIndex(pz, px, 0, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 1, 0)),
        p.points.fetch(p.points.nhwcIndex(pz, px, 2, 0))
    );

    if (p.isPoints)
    {
        // Homogeneous point: includes the translation entries mtx[3][c].
        for (int c = 0; c < 4; ++c)
            p.out.store(p.out.nhwcIndex(pz, px, c, 0), pos.x * mtx[0][c] + pos.y * mtx[1][c] + pos.z * mtx[2][c] + mtx[3][c]);
    }
    else
    {
        // Vector: no translation, three output components only.
        for (int c = 0; c < 3; ++c)
            p.out.store(p.out.nhwcIndex(pz, px, c, 0), pos.x * mtx[0][c] + pos.y * mtx[1][c] + pos.z * mtx[2][c]);
    }
}
|
||||
|
||||
// Backward of xfmPointsFwdKernel with respect to the points: one thread per
// point px of batch entry pz. Multiplies the incoming gradient (fetched from
// p.out) by the transposed matrix and stores it with points.store_grad.
// No gradient for the matrix is produced here.
//
// Channel 3 of the incoming gradient is only fetched when isPoints is set: the
// forward kernel writes just channels 0..2 for vectors and the vector branch
// never uses d_out.w, so the unconditional fetch read a channel the forward
// never produced.
// NOTE(review): relies on blockDim.x >= 16 and on all threads of a block
// sharing the same pz (presumably blockDim.z == 1) -- confirm at the launch site.
__global__ void xfmPointsBwdKernel(XfmKernelParams p)
{
    unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int pz = blockIdx.z * blockDim.z + threadIdx.z;

    // Stored transposed: mtx[col][row] holds matrix element (row, col).
    __shared__ float mtx[4][4];
    if (threadIdx.x < 16)
        mtx[threadIdx.x % 4][threadIdx.x / 4] = p.matrix.fetch(p.matrix.nhwcIndex(pz, threadIdx.x / 4, threadIdx.x % 4, 0));
    __syncthreads();

    // Bounds check only after the barrier so that every thread reaches it.
    if (px >= p.gridSize.x)
        return;

    vec4f d_out(
        p.out.fetch(p.out.nhwcIndex(pz, px, 0, 0)),
        p.out.fetch(p.out.nhwcIndex(pz, px, 1, 0)),
        p.out.fetch(p.out.nhwcIndex(pz, px, 2, 0)),
        p.isPoints ? p.out.fetch(p.out.nhwcIndex(pz, px, 3, 0)) : 0.0f
    );

    if (p.isPoints)
    {
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out.x * mtx[0][0] + d_out.y * mtx[0][1] + d_out.z * mtx[0][2] + d_out.w * mtx[0][3]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out.x * mtx[1][0] + d_out.y * mtx[1][1] + d_out.z * mtx[1][2] + d_out.w * mtx[1][3]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out.x * mtx[2][0] + d_out.y * mtx[2][1] + d_out.z * mtx[2][2] + d_out.w * mtx[2][3]);
    }
    else
    {
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 0, 0), d_out.x * mtx[0][0] + d_out.y * mtx[0][1] + d_out.z * mtx[0][2]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 1, 0), d_out.x * mtx[1][0] + d_out.y * mtx[1][1] + d_out.z * mtx[1][2]);
        p.points.store_grad(p.points.nhwcIndexContinuous(pz, px, 2, 0), d_out.x * mtx[2][0] + d_out.y * mtx[2][1] + d_out.z * mtx[2][2]);
    }
}
|
||||
23
render/renderutils/c_src/mesh.h
Normal file
23
render/renderutils/c_src/mesh.h
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Parameter bundle passed by value to the xfm (matrix transform) kernels.
struct XfmKernelParams
{
    // When true, the backward kernel also includes the d_out.w * mtx[i][3]
    // term in the gradient written for `points`.
    bool    isPoints;

    // Tensor that is fetched per element and receives gradients via store_grad().
    Tensor  points;

    // Transform matrix tensor.
    // NOTE(review): presumably the source of the `mtx` values used by the kernels -- confirm.
    Tensor  matrix;

    // Output tensor; the backward kernel fetches four channels from it as d_out.
    Tensor  out;

    // NOTE(review): presumably the launch extent used for bounds checks -- confirm against the kernels.
    dim3    gridSize;
};
|
||||
182
render/renderutils/c_src/normal.cu
Normal file
182
render/renderutils/c_src/normal.cu
Normal file
@@ -0,0 +1,182 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#include "common.h"
|
||||
#include "normal.h"
|
||||
|
||||
#define NORMAL_THRESHOLD 0.1f
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Perturb shading normal by tangent frame
|
||||
|
||||
// Builds a shading normal from a tangent-space perturbation.
//   perturbed_nrm : tangent-space coefficients (x -> tangent, y -> bitangent, z -> normal)
//   smooth_nrm    : interpolated normal
//   smooth_tng    : interpolated tangent
//   opengl        : when true, the bitangent contribution is negated
// The z coefficient is clamped to be non-negative before use. Returns the
// normalized combination (zero vector if the combination has zero length).
__device__ vec3f fwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, bool opengl)
{
    // Bitangent is derived from tangent x normal and normalized.
    const vec3f unit_bitng = safeNormalize(cross(smooth_tng, smooth_nrm));
    const float handedness = opengl ? -1.0f : 1.0f;

    const vec3f tangent_term   = smooth_tng * perturbed_nrm.x;
    const vec3f bitangent_term = handedness * unit_bitng * perturbed_nrm.y;
    const vec3f normal_term    = smooth_nrm * max(perturbed_nrm.z, 0.0f);

    return safeNormalize(tangent_term + bitangent_term + normal_term);
}
|
||||
|
||||
// Backward pass of fwdPerturbNormal.
// Recomputes the forward intermediates, then accumulates (+=) gradients into
// d_perturbed_nrm, d_smooth_nrm and d_smooth_tng given the upstream gradient
// d_out of the normalized shading normal.
__device__ void bwdPerturbNormal(const vec3f perturbed_nrm, const vec3f smooth_nrm, const vec3f smooth_tng, vec3f &d_perturbed_nrm, vec3f &d_smooth_nrm, vec3f &d_smooth_tng, const vec3f d_out, bool opengl)
{
    ////////////////////////////////////////////////////////////////////////
    // Forward recomputation
    const float handedness  = opengl ? -1.0f : 1.0f;
    const vec3f raw_bitng   = cross(smooth_tng, smooth_nrm);
    const vec3f unit_bitng  = safeNormalize(raw_bitng);
    const vec3f raw_shading = smooth_tng * perturbed_nrm.x + handedness * unit_bitng * perturbed_nrm.y + smooth_nrm * max(perturbed_nrm.z, 0.0f);

    ////////////////////////////////////////////////////////////////////////
    // Backward

    // Gradient through the final normalization.
    vec3f g_shading(0.0f);
    bwdSafeNormalize(raw_shading, g_shading, d_out);

    // Tangent term: smooth_tng * perturbed_nrm.x
    d_smooth_tng      += g_shading * perturbed_nrm.x;
    d_perturbed_nrm.x += sum(g_shading * smooth_tng);

    // Bitangent term: handedness * unit_bitng * perturbed_nrm.y
    vec3f g_unit_bitng(0.0f);
    g_unit_bitng      += handedness * g_shading * perturbed_nrm.y;
    d_perturbed_nrm.y += handedness * sum(g_shading * unit_bitng);

    // Normal term: max(perturbed_nrm.z, 0) only passes gradient when z is positive.
    if (perturbed_nrm.z > 0.0f)
    {
        d_smooth_nrm      += g_shading * perturbed_nrm.z;
        d_perturbed_nrm.z += sum(g_shading * smooth_nrm);
    }

    // Gradient through the bitangent normalization, then through the cross product.
    vec3f g_raw_bitng(0.0f);
    bwdSafeNormalize(raw_bitng, g_raw_bitng, g_unit_bitng);
    bwdCross(smooth_tng, smooth_nrm, d_smooth_tng, d_smooth_nrm, g_raw_bitng);
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
#define bent_nrm_eps 0.001f
|
||||
|
||||
// Blends between the geometric normal and the smooth normal based on
// dot(view_vec, smooth_nrm). The blend weight is that dot product divided by
// NORMAL_THRESHOLD, clamped to [0, 1]: weight 0 yields geom_nrm, weight 1
// yields smooth_nrm. The result is not re-normalized here.
__device__ vec3f fwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm)
{
    const float blend = clamp(dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD, 0.0f, 1.0f);

    const vec3f geom_part   = geom_nrm * (1.0f - blend);
    const vec3f smooth_part = smooth_nrm * blend;
    return geom_part + smooth_part;
}
|
||||
|
||||
// Backward pass of fwdBendNormal.
// Accumulates (+=) gradients into d_view_vec, d_smooth_nrm and d_geom_nrm
// given the upstream gradient d_out of the blended normal.
__device__ void bwdBendNormal(const vec3f view_vec, const vec3f smooth_nrm, const vec3f geom_nrm, vec3f& d_view_vec, vec3f& d_smooth_nrm, vec3f& d_geom_nrm, const vec3f d_out)
{
    ////////////////////////////////////////////////////////////////////////
    // Forward recomputation
    const float cos_view = dot(view_vec, smooth_nrm);

    ////////////////////////////////////////////////////////////////////////
    // Backward

    // Above the threshold the blend weight clamps to 1, so the whole
    // gradient is routed to the smooth normal.
    if (cos_view > NORMAL_THRESHOLD)
    {
        d_smooth_nrm += d_out;
        return;
    }

    const float blend = clamp(cos_view / NORMAL_THRESHOLD, 0.0f, 1.0f);

    // out = geom_nrm * (1 - blend) + smooth_nrm * blend
    d_geom_nrm   += d_out * (1.0f - blend);
    d_smooth_nrm += d_out * blend;

    const float d_blend = sum(d_out * (smooth_nrm - geom_nrm));

    // The clamp has zero slope for negative dot products. The upper bound
    // needs no test here because that case already returned above.
    const float d_cos_view = (cos_view < 0.0f) ? 0.0f : d_blend / NORMAL_THRESHOLD;

    bwdDot(view_vec, smooth_nrm, d_view_vec, d_smooth_nrm, d_cos_view);
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// Kernels
|
||||
|
||||
// Forward kernel: one thread per output element.
// Thread layout: x/y come from the 2D block/thread indices, z from blockIdx.z;
// threads falling outside p.gridSize exit immediately.
// For each element it normalizes the smooth normal/tangent and the view
// vector (view_pos - pos), perturbs the normal, bends it against the
// geometric normal, and stores the result to p.out.
__global__ void PrepareShadingNormalFwdKernel(PrepareShadingNormalKernelParams p)
{
    // Calculate pixel position.
    const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    // Gather inputs.
    const vec3f pos           = p.pos.fetch3(px, py, pz);
    const vec3f view_pos      = p.view_pos.fetch3(px, py, pz);
    const vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
    const vec3f geom_nrm      = p.geom_nrm.fetch3(px, py, pz);
    const vec3f smooth_nrm    = safeNormalize(p.smooth_nrm.fetch3(px, py, pz));
    const vec3f smooth_tng    = safeNormalize(p.smooth_tng.fetch3(px, py, pz));
    const vec3f view_vec      = safeNormalize(view_pos - pos);

    const vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);

    // With two-sided shading enabled, both normals are negated when the view
    // vector and the geometric normal have a negative dot product.
    const bool flip = p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f;
    const vec3f bent = flip
        ? fwdBendNormal(view_vec, -shading_nrm, -geom_nrm)
        : fwdBendNormal(view_vec, shading_nrm, geom_nrm);

    p.out.store(px, py, pz, bent);
}
|
||||
|
||||
// Backward kernel: one thread per element, same thread layout and bounds
// check as the forward kernel. p.out is read here as the upstream gradient.
// The forward intermediates are recomputed, gradients are propagated back
// through bend -> perturb -> normalize, and the results are written with
// store_grad() for every input tensor.
__global__ void PrepareShadingNormalBwdKernel(PrepareShadingNormalKernelParams p)
{
    // Calculate pixel position.
    const unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int pz = blockIdx.z;

    const bool inside = px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z;
    if (!inside)
        return;

    // Gather inputs and the upstream gradient.
    const vec3f pos           = p.pos.fetch3(px, py, pz);
    const vec3f view_pos      = p.view_pos.fetch3(px, py, pz);
    const vec3f perturbed_nrm = p.perturbed_nrm.fetch3(px, py, pz);
    const vec3f raw_nrm       = p.smooth_nrm.fetch3(px, py, pz);
    const vec3f raw_tng       = p.smooth_tng.fetch3(px, py, pz);
    const vec3f geom_nrm      = p.geom_nrm.fetch3(px, py, pz);
    const vec3f d_out         = p.out.fetch3(px, py, pz);

    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Forward recomputation

    const vec3f raw_view   = view_pos - pos;
    const vec3f smooth_nrm = safeNormalize(raw_nrm);
    const vec3f smooth_tng = safeNormalize(raw_tng);
    const vec3f view_vec   = safeNormalize(raw_view);

    const vec3f shading_nrm = fwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, p.opengl);

    ///////////////////////////////////////////////////////////////////////////////////////////////////
    // Backward

    // Bend step. When the forward pass negated the normals, the gradients
    // with respect to the un-negated normals are negated as well.
    const bool flip = p.two_sided_shading && dot(view_vec, geom_nrm) < 0.0f;
    const vec3f bend_shading = flip ? -shading_nrm : shading_nrm;
    const vec3f bend_geom    = flip ? -geom_nrm : geom_nrm;

    vec3f d_view_vec(0.0f), d_shading_nrm(0.0f), d_geom_nrm(0.0f);
    bwdBendNormal(view_vec, bend_shading, bend_geom, d_view_vec, d_shading_nrm, d_geom_nrm, d_out);
    if (flip)
    {
        d_shading_nrm = -d_shading_nrm;
        d_geom_nrm    = -d_geom_nrm;
    }

    // Perturb step.
    vec3f d_perturbed_nrm(0.0f), d_smooth_nrm(0.0f), d_smooth_tng(0.0f);
    bwdPerturbNormal(perturbed_nrm, smooth_nrm, smooth_tng, d_perturbed_nrm, d_smooth_nrm, d_smooth_tng, d_shading_nrm, p.opengl);

    // Input normalizations.
    vec3f d_raw_view(0.0f), d_raw_nrm(0.0f), d_raw_tng(0.0f);
    bwdSafeNormalize(raw_view, d_raw_view, d_view_vec);
    bwdSafeNormalize(raw_nrm, d_raw_nrm, d_smooth_nrm);
    bwdSafeNormalize(raw_tng, d_raw_tng, d_smooth_tng);

    // view = view_pos - pos, so pos receives the negated gradient.
    p.pos.store_grad(px, py, pz, -d_raw_view);
    p.view_pos.store_grad(px, py, pz, d_raw_view);
    p.perturbed_nrm.store_grad(px, py, pz, d_perturbed_nrm);
    p.smooth_nrm.store_grad(px, py, pz, d_raw_nrm);
    p.smooth_tng.store_grad(px, py, pz, d_raw_tng);
    p.geom_nrm.store_grad(px, py, pz, d_geom_nrm);
}
|
||||
27
render/renderutils/c_src/normal.h
Normal file
27
render/renderutils/c_src/normal.h
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Parameter bundle passed by value to the PrepareShadingNormal kernels.
struct PrepareShadingNormalKernelParams
{
    Tensor  pos;            // positions; the view vector is computed as view_pos - pos
    Tensor  view_pos;       // view positions
    Tensor  perturbed_nrm;  // tangent-space normal perturbation
    Tensor  smooth_nrm;     // smooth normals (normalized inside the kernels)
    Tensor  smooth_tng;     // smooth tangents (normalized inside the kernels)
    Tensor  geom_nrm;       // geometric normals
    Tensor  out;            // forward: result is stored here; backward: read as the upstream gradient
    dim3    gridSize;       // extent used by the kernels' bounds check
    bool    two_sided_shading;  // enables negating the normals when dot(view, geom_nrm) < 0
    bool    opengl;             // forwarded to fwdPerturbNormal / bwdPerturbNormal
};
|
||||
92
render/renderutils/c_src/tensor.h
Normal file
92
render/renderutils/c_src/tensor.h
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#if defined(__CUDACC__) && defined(BFLOAT16)
|
||||
#include <cuda_bf16.h> // bfloat16 is float32 compatible with less mantissa bits
|
||||
#endif
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
// CUDA-side Tensor class for in/out parameter parsing. Can be float32 or bfloat16
|
||||
|
||||
// Lightweight view of a 4D (n, h, w, c) tensor handed to the CUDA kernels.
// Holds raw pointers to the values and to the gradient buffer together with
// the size/stride information needed to address single elements.
struct Tensor
{
    void*   val;        // element storage, read by fetch() and written by store()
    void*   d_val;      // gradient storage, written by store_grad()
    int     dims[4];    // sizes checked by nhwcIndex(); an axis of size 1 is broadcast
    int     _dims[4];   // sizes used by nhwcIndexContinuous() for dense addressing
    int     strides[4]; // per-axis element strides
    bool    fp16;       // with BFLOAT16 defined: true -> storage is __nv_bfloat16, false -> float

#if defined(__CUDA__) && !defined(__CUDA_ARCH__)
    Tensor()
        : val(nullptr)
        , d_val(nullptr)
        , dims{ 0, 0, 0, 0 }
        , _dims{ 0, 0, 0, 0 }
        , strides{ 0, 0, 0, 0 }
        , fp16(true)
    {
    }
#endif

#ifdef __CUDACC__
    //////////////////////////////////////////////////////////////////////////////////////////
    // Index helpers

    // Offset contributed by one axis when broadcasting: zero if the axis has size 1.
    __device__ inline int broadcastOffset(int axis, int coord) const
    {
        if (dims[axis] == 1)
            return 0;
        return coord * strides[axis];
    }

    // Strided element offset without any broadcasting.
    __device__ inline int _nhwcIndex(int n, int h, int w, int c) const
    {
        return n * strides[0] + h * strides[1] + w * strides[2] + c * strides[3];
    }

    // Strided element offset where axes of size 1 are broadcast.
    __device__ inline int nhwcIndex(int n, int h, int w, int c) const
    {
        return broadcastOffset(0, n) + broadcastOffset(1, h) + broadcastOffset(2, w) + broadcastOffset(3, c);
    }

    // Dense row-major offset computed from _dims; strides are ignored.
    __device__ inline int nhwcIndexContinuous(int n, int h, int w, int c) const
    {
        int flat = n;
        flat = flat * _dims[1] + h;
        flat = flat * _dims[2] + w;
        flat = flat * _dims[3] + c;
        return flat;
    }

    //////////////////////////////////////////////////////////////////////////////////////////
    // Single element access by flat index
#ifdef BFLOAT16
    __device__ inline float fetch(unsigned int idx) const
    {
        if (fp16)
            return __bfloat162float(static_cast<const __nv_bfloat16*>(val)[idx]);
        return static_cast<const float*>(val)[idx];
    }

    __device__ inline void store(unsigned int idx, float _val)
    {
        if (fp16)
            static_cast<__nv_bfloat16*>(val)[idx] = __float2bfloat16(_val);
        else
            static_cast<float*>(val)[idx] = _val;
    }

    __device__ inline void store_grad(unsigned int idx, float _val)
    {
        if (fp16)
            static_cast<__nv_bfloat16*>(d_val)[idx] = __float2bfloat16(_val);
        else
            static_cast<float*>(d_val)[idx] = _val;
    }
#else
    // Without BFLOAT16 the storage is always accessed as float; fp16 is not consulted.
    __device__ inline float fetch(unsigned int idx) const
    {
        return static_cast<const float*>(val)[idx];
    }

    __device__ inline void store(unsigned int idx, float _val)
    {
        static_cast<float*>(val)[idx] = _val;
    }

    __device__ inline void store_grad(unsigned int idx, float _val)
    {
        static_cast<float*>(d_val)[idx] = _val;
    }
#endif

    //////////////////////////////////////////////////////////////////////////////////////////
    // Fetch, use broadcasting for tensor dimensions of size 1
    __device__ inline float fetch1(unsigned int x, unsigned int y, unsigned int z) const
    {
        return fetch(nhwcIndex(z, y, x, 0));
    }

    __device__ inline vec3f fetch3(unsigned int x, unsigned int y, unsigned int z) const
    {
        const float c0 = fetch(nhwcIndex(z, y, x, 0));
        const float c1 = fetch(nhwcIndex(z, y, x, 1));
        const float c2 = fetch(nhwcIndex(z, y, x, 2));
        return vec3f(c0, c1, c2);
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Store, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside
    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, float _val)
    {
        store(_nhwcIndex(z, y, x, 0), _val);
    }

    __device__ inline void store(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
    {
        store(_nhwcIndex(z, y, x, 0), _val.x);
        store(_nhwcIndex(z, y, x, 1), _val.y);
        store(_nhwcIndex(z, y, x, 2), _val.z);
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Store gradient, no broadcasting here. Assume we output full res gradient and then reduce using torch.sum outside.
    // Gradients are addressed densely through nhwcIndexContinuous().
    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, float _val)
    {
        store_grad(nhwcIndexContinuous(z, y, x, 0), _val);
    }

    __device__ inline void store_grad(unsigned int x, unsigned int y, unsigned int z, vec3f _val)
    {
        store_grad(nhwcIndexContinuous(z, y, x, 0), _val.x);
        store_grad(nhwcIndexContinuous(z, y, x, 1), _val.y);
        store_grad(nhwcIndexContinuous(z, y, x, 2), _val.z);
    }
#endif

};
|
||||
1062
render/renderutils/c_src/torch_bindings.cpp
Normal file
1062
render/renderutils/c_src/torch_bindings.cpp
Normal file
File diff suppressed because it is too large
Load Diff
109
render/renderutils/c_src/vec3f.h
Normal file
109
render/renderutils/c_src/vec3f.h
Normal file
@@ -0,0 +1,109 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Three-component float vector. The constructors and compound operators are
// only available when compiling with a CUDA compiler and are device-only.
struct vec3f
{
    float x, y, z;

#ifdef __CUDACC__
    // Default construction leaves the components uninitialized.
    __device__ vec3f() { }
    // Splat: all three components take the same value.
    __device__ vec3f(float v) : x(v), y(v), z(v) { }
    __device__ vec3f(float _x, float _y, float _z) : x(_x), y(_y), z(_z) { }
    __device__ vec3f(float3 v) : x(v.x), y(v.y), z(v.z) { }

    // Component-wise compound assignment.
    __device__ inline vec3f& operator+=(const vec3f& b)
    {
        x += b.x;
        y += b.y;
        z += b.z;
        return *this;
    }

    __device__ inline vec3f& operator-=(const vec3f& b)
    {
        x -= b.x;
        y -= b.y;
        z -= b.z;
        return *this;
    }

    __device__ inline vec3f& operator*=(const vec3f& b)
    {
        x *= b.x;
        y *= b.y;
        z *= b.z;
        return *this;
    }

    __device__ inline vec3f& operator/=(const vec3f& b)
    {
        x /= b.x;
        y /= b.y;
        z /= b.z;
        return *this;
    }
#endif
};
|
||||
|
||||
#ifdef __CUDACC__
|
||||
// Component-wise binary operators, expressed through the compound forms.
__device__ static inline vec3f operator+(const vec3f& a, const vec3f& b)
{
    vec3f r = a;
    r += b;
    return r;
}

__device__ static inline vec3f operator-(const vec3f& a, const vec3f& b)
{
    vec3f r = a;
    r -= b;
    return r;
}

__device__ static inline vec3f operator*(const vec3f& a, const vec3f& b)
{
    vec3f r = a;
    r *= b;
    return r;
}

__device__ static inline vec3f operator/(const vec3f& a, const vec3f& b)
{
    vec3f r = a;
    r /= b;
    return r;
}

// Unary negation of every component.
__device__ static inline vec3f operator-(const vec3f& a)
{
    return vec3f(-a.x, -a.y, -a.z);
}
|
||||
|
||||
// Returns the sum of the three components, added in x, y, z order.
__device__ static inline float sum(vec3f a)
{
    float total = a.x;
    total += a.y;
    total += a.z;
    return total;
}
|
||||
|
||||
// Returns the cross product a x b.
__device__ static inline vec3f cross(vec3f a, vec3f b)
{
    return vec3f(
        a.y * b.z - a.z * b.y,
        a.z * b.x - a.x * b.z,
        a.x * b.y - a.y * b.x
    );
}
|
||||
|
||||
// Backward pass of cross(a, b): accumulates (+=) the gradients with respect
// to a and b into d_a and d_b, given the upstream gradient d_out.
__device__ static inline void bwdCross(vec3f a, vec3f b, vec3f &d_a, vec3f &d_b, vec3f d_out)
{
    // Gradient with respect to the first operand.
    d_a += vec3f(
        d_out.z * b.y - d_out.y * b.z,
        d_out.x * b.z - d_out.z * b.x,
        d_out.y * b.x - d_out.x * b.y
    );

    // Gradient with respect to the second operand.
    d_b += vec3f(
        d_out.y * a.z - d_out.z * a.y,
        d_out.z * a.x - d_out.x * a.z,
        d_out.x * a.y - d_out.y * a.x
    );
}
|
||||
|
||||
// Returns the dot product of a and b (component products summed in x, y, z order).
__device__ static inline float dot(vec3f a, vec3f b)
{
    return sum(a * b);
}
|
||||
|
||||
// Backward pass of dot(a, b): accumulates (+=) d_out * b into d_a and
// d_out * a into d_b.
__device__ static inline void bwdDot(vec3f a, vec3f b, vec3f& d_a, vec3f& d_b, float d_out)
{
    const vec3f upstream(d_out);
    d_a += upstream * b;
    d_b += upstream * a;
}
|
||||
|
||||
// Returns 2 * n * dot(n, x) - x, i.e. x mirrored about n.
// NOTE(review): n is presumably expected to be unit length -- it is not normalized here.
__device__ static inline vec3f reflect(vec3f x, vec3f n)
{
    const float projection = dot(n, x);
    const vec3f doubled    = n * 2.0f * projection;
    return doubled - x;
}
|
||||
|
||||
// Backward pass of reflect(x, n): accumulates (+=) the gradients with respect
// to x and n into d_x and d_n, given the upstream gradient d_out.
__device__ static inline void bwdReflect(vec3f x, vec3f n, vec3f& d_x, vec3f& d_n, const vec3f d_out)
{
    // Off-diagonal entries of d(out)/d(x); the matrix is symmetric so each is used twice.
    const float two_nx_ny = 2 * n.x * n.y;
    const float two_nx_nz = 2 * n.x * n.z;
    const float two_ny_nz = 2 * n.y * n.z;

    d_x += vec3f(
        d_out.x * (2 * n.x * n.x - 1) + d_out.y * two_nx_ny + d_out.z * two_nx_nz,
        d_out.x * two_nx_ny + d_out.y * (2 * n.y * n.y - 1) + d_out.z * two_ny_nz,
        d_out.x * two_nx_nz + d_out.y * two_ny_nz + d_out.z * (2 * n.z * n.z - 1)
    );

    // Diagonal entries of d(out)/d(n).
    const float diag_x = 2 * (2 * n.x * x.x + n.y * x.y + n.z * x.z);
    const float diag_y = 2 * (n.x * x.x + 2 * n.y * x.y + n.z * x.z);
    const float diag_z = 2 * (n.x * x.x + n.y * x.y + 2 * n.z * x.z);

    d_n += vec3f(
        d_out.x * diag_x + d_out.y * (2 * n.y * x.x) + d_out.z * (2 * n.z * x.x),
        d_out.x * (2 * n.x * x.y) + d_out.y * diag_y + d_out.z * (2 * n.z * x.y),
        d_out.x * (2 * n.x * x.z) + d_out.y * (2 * n.y * x.z) + d_out.z * diag_z
    );
}
|
||||
|
||||
// Returns v scaled to unit length, or the zero vector when the length of v
// is not greater than zero.
__device__ static inline vec3f safeNormalize(vec3f v)
{
    const float len = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
    if (len > 0.0f)
        return v / len;
    return vec3f(0.0f);
}
|
||||
|
||||
// Backward pass of safeNormalize(v): accumulates (+=) the gradient with
// respect to v into d_v, given the upstream gradient d_out. Nothing is
// accumulated when the length of v is not greater than zero, matching the
// constant zero output of the forward pass in that case.
__device__ static inline void bwdSafeNormalize(const vec3f v, vec3f& d_v, const vec3f d_out)
{
    const float len_sq = v.x * v.x + v.y * v.y + v.z * v.z;
    const float l = sqrtf(len_sq);
    if (l > 0.0f)
    {
        // 1 / |v|^3. The literal is single precision so the division is not
        // promoted to double on the device.
        const float fac = 1.0f / powf(len_sq, 1.5f);

        d_v.x += (d_out.x * (v.y * v.y + v.z * v.z) - d_out.y * (v.x * v.y) - d_out.z * (v.x * v.z)) * fac;
        d_v.y += (d_out.y * (v.x * v.x + v.z * v.z) - d_out.x * (v.y * v.x) - d_out.z * (v.y * v.z)) * fac;
        d_v.z += (d_out.z * (v.x * v.x + v.y * v.y) - d_out.x * (v.z * v.x) - d_out.y * (v.z * v.y)) * fac;
    }
}
|
||||
|
||||
#endif
|
||||
25
render/renderutils/c_src/vec4f.h
Normal file
25
render/renderutils/c_src/vec4f.h
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// Four-component float vector. The constructors are only available when
// compiling with a CUDA compiler and are device-only.
struct vec4f
{
    float x, y, z, w;

#ifdef __CUDACC__
    // Default construction leaves the components uninitialized.
    __device__ vec4f() { }
    // Splat: all four components take the same value.
    __device__ vec4f(float v) : x(v), y(v), z(v), w(v) { }
    __device__ vec4f(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) { }
    __device__ vec4f(float4 v) : x(v.x), y(v.y), z(v.z), w(v.w) { }
#endif
};
|
||||
|
||||
Reference in New Issue
Block a user