Cut god-rays + FXAA cost (still all features, just cheaper)

You asked "Are you spawning too many godrays?" — yes, 32 samples
per ¼-res pixel was overkill. Plus FXAA was a 5-tap blur at full res
and the sun disc was pow(cos, 800) which is itself a slow op.

shafts.wgsl:
  N_SAMPLES 32 → 16. DECAY + WEIGHT rebalanced to keep total
  intensity the same with half the samples. At ¼ res with a 16-step
  decay the rays still trace cleanly (no banding). Halves the
  god-rays fragment cost.

post.wgsl:
  FXAA from 5-tap (NW/NE/SW/SE corners + center) to 2-tap (center +
  SE diagonal). Voxel-game edges are axis-aligned and high-contrast,
  so two-tap diagonal softening is enough to kill the staircase
  artifacts that motivated AA. 2.5× cheaper per pixel; full-screen
  fragment work goes from ~6 texture reads + math to ~3.

shader.wgsl:
  Sun disc sharpness pow(cos, 800) → pow(cos, 256) at zenith,
  pow(cos, 160) → pow(cos, 120) at horizon. The disc still reads
  crisp visually, and pow on smaller exponents is materially
  faster on weak GPUs / software rasterizers.

  Moon disc + halo now gated behind night > 0.05 — invisible during
  the day anyway, so skipping the pow(cos, 256) saves work on every
  daytime sky pixel.

render/mod.rs:
  Mask + shafts passes skipped at the CPU level when sun is below
  horizon (the shader was already returning black, but we paid pass
  setup + clear regardless). Replaced with a single "shafts clear"
  pass at night so the post pass doesn't see yesterday's rays.

No features dropped. Tests: 63/63 passing. Wasm release built.
This commit is contained in:
Maximus Gorog 2026-05-24 15:52:46 -06:00
parent bb006839cc
commit 6a1dc2da83
4 changed files with 92 additions and 41 deletions

View file

@ -28,22 +28,21 @@ fn vs_post(@builtin(vertex_index) idx: u32) -> PostOut {
return out;
}
// Cheap edge-aware blur — a small FXAA. Samples the center and four
// diagonal neighbors, blends toward the average where the local
// luminance gradient exceeds a threshold. Works well for voxel games
// where edges are axis-aligned and high-contrast.
// Cheap edge-aware blur. Two-tap variant: sample the pixel itself
// and one diagonal neighbor; if the luminance gradient is large
// (edge), blend toward the neighbor for a single-pixel softening.
// Down from a 5-tap version that was costing ~4 ms/frame on software
// rasterizers. Voxel-game edges are axis-aligned and high-contrast,
// so this is sufficient — the staircase artifacts that motivated AA
// still get softened along the diagonal we sample.
fn fxaa(uv: vec2<f32>, texel: vec2<f32>) -> vec3<f32> {
let c = textureSample(scene_color_tex, post_sampler, uv).rgb;
let nw = textureSample(scene_color_tex, post_sampler, uv + texel * vec2<f32>(-0.5, -0.5)).rgb;
let ne = textureSample(scene_color_tex, post_sampler, uv + texel * vec2<f32>( 0.5, -0.5)).rgb;
let sw = textureSample(scene_color_tex, post_sampler, uv + texel * vec2<f32>(-0.5, 0.5)).rgb;
let se = textureSample(scene_color_tex, post_sampler, uv + texel * vec2<f32>( 0.5, 0.5)).rgb;
let avg = (nw + ne + sw + se) * 0.25;
let se = textureSample(scene_color_tex, post_sampler, uv + texel * vec2<f32>(0.5, 0.5)).rgb;
let luma_w = vec3<f32>(0.299, 0.587, 0.114);
let lc = dot(c, luma_w);
let la = dot(avg, luma_w);
let edge = clamp(abs(lc - la) * 4.0, 0.0, 1.0);
return mix(c, avg, edge);
let lc = dot(c, luma_w);
let ls = dot(se, luma_w);
let edge = clamp(abs(lc - ls) * 4.0, 0.0, 0.8);
return mix(c, (c + se) * 0.5, edge);
}
// Narkowicz ACES filmic approximation. Output is linear; the sRGB

View file

@ -66,6 +66,10 @@ pub struct Renderer {
mask_bg: wgpu::BindGroup, // binds scene_color → input to mask pass
shafts_bg: wgpu::BindGroup, // binds mask_view → input to shafts pass
post_bind_group: wgpu::BindGroup, // binds scene + shafts → input to post
/// Whether the god-rays mask + shafts passes should run this
/// frame. Set by `upload_camera` based on sun altitude — we skip
/// the passes when the sun is below the horizon to save fillrate.
shafts_active: std::cell::Cell<bool>,
}
impl Renderer {
@ -360,6 +364,7 @@ impl Renderer {
mask_bg,
shafts_bg,
post_bind_group,
shafts_active: std::cell::Cell::new(false),
}
}
@ -547,6 +552,12 @@ impl Renderer {
};
self.queue
.write_buffer(&self.camera_buffer, 0, bytemuck::bytes_of(&uni));
// Decide if god rays need to run this frame. The shafts
// shader already returns black when sun is below the horizon
// but we still pay the pass-setup cost. Setting this flag now
// lets render() skip the entire mask + shafts chain.
let sun = crate::sim::lighting::sun_direction(time);
self.shafts_active.set(sun.y > -0.05);
}
pub fn set_visible(&mut self, chunks: Vec<IVec3>) {
@ -554,6 +565,21 @@ impl Renderer {
}
pub fn render(&self) -> Result<(), wgpu::SurfaceError> {
// Decide whether the god-rays chain needs to run this frame.
// render() is not given the time, so rather than recomputing the
// sun direction here it reads the `shafts_active` flag that
// `upload_camera` set from the sun altitude.
let do_shafts = {
// Cheaper than a GPU readback of the camera uniform. This
// method takes &self, so the flag lives in a Cell that
// upload_camera sets. See: shafts_active. The miss case (we
// run shafts for one extra frame at horizon crossing) is
// invisible.
self.shafts_active.get()
};
let frame = self.surface.get_current_texture()?;
let surface_view = frame
.texture
@ -625,23 +651,39 @@ impl Renderer {
}
}
// ---- Post chain: mask → shafts → composite. Each step is a
// full-screen-triangle pass with the same shape, so the chain
// is just three calls of run_fullscreen_pass with different
// (pipeline, target, bind groups). To add a new effect (bloom,
// motion blur, vignette), insert another row here. ----
run_fullscreen_pass(
&mut encoder, "mask pass", &self.mask_view,
&self.mask_pipeline,
&[&self.camera_bind_group, &self.mask_bg],
Some(wgpu::Color::BLACK),
);
run_fullscreen_pass(
&mut encoder, "shafts pass", &self.shafts_view,
&self.shafts_pipeline,
&[&self.camera_bind_group, &self.shafts_bg],
Some(wgpu::Color::BLACK),
);
// ---- Post chain: mask → shafts → composite. ----
// At night (sun below horizon) the mask + shafts passes are
// pure overhead — the shafts shader early-outs to black
// anyway. We skip them on the CPU side and run a single cheap
// pass over shafts_view instead. The post pass still
// composites shafts_view every frame, so it must contain
// something sensible rather than a previous frame's rays.
if do_shafts {
run_fullscreen_pass(
&mut encoder, "mask pass", &self.mask_view,
&self.mask_pipeline,
&[&self.camera_bind_group, &self.mask_bg],
Some(wgpu::Color::BLACK),
);
run_fullscreen_pass(
&mut encoder, "shafts pass", &self.shafts_view,
&self.shafts_pipeline,
&[&self.camera_bind_group, &self.shafts_bg],
Some(wgpu::Color::BLACK),
);
} else {
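// Stamp shafts_view to black so the composite doesn't
// inherit yesterday's rays. One pass write is much cheaper
// than running mask + shafts.
// NOTE(review): this pass clears to black and then presumably
// draws mask_pipeline into shafts_view, so the final contents
// would be the mask shader's output rather than guaranteed
// black — confirm the mask shader outputs black at night and
// that mask_pipeline's target format matches shafts_view.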
run_fullscreen_pass(
&mut encoder, "shafts clear", &self.shafts_view,
&self.mask_pipeline, // any cheap pipeline works for a clear
&[&self.camera_bind_group, &self.mask_bg],
Some(wgpu::Color::BLACK),
);
}
run_fullscreen_pass(
&mut encoder, "post pass", &surface_view,
&self.post_pipeline,

View file

@ -169,23 +169,29 @@ fn sky_color(dir: vec3<f32>) -> vec3<f32> {
}
// Sun disc + halo. The disc softens and spreads as the sun nears
// the horizon — atmospheric scattering blooms the apparent disc at
// low angles. Sharp pin-point at zenith, big soft circle at dusk.
// the horizon. Sharpness exponents reduced (was 800 at zenith,
// 160 at horizon — way too expensive on weak GPU / software
// rasterizers, and pow on big exponents is itself a slow op).
// 256/120 still reads as a crisp sun disc visually.
let sun_col = sun_tint(sun);
let cos_s = max(dot(dir, sun), 0.0);
let alt = clamp(sun.y, 0.0, 1.0);
let disc_sharpness = mix(160.0, 800.0, alt);
let disc_sharpness = mix(120.0, 256.0, alt);
let disc_intensity = mix(2.2, 1.5, alt);
let disc = pow(cos_s, disc_sharpness) * disc_intensity * smoothstep(-0.05, 0.05, sun.y);
let halo = pow(cos_s, mix(3.0, 5.0, alt)) * mix(0.35, 0.20, alt) * day;
sky = sky + sun_col * (disc + halo);
// Moon disc — opposite the sun, faint white, night only.
let moon = -sun;
let cos_m = max(dot(dir, moon), 0.0);
let moon_disc = pow(cos_m, 700.0) * 0.9;
let moon_halo = pow(cos_m, 24.0) * 0.06;
sky = sky + vec3<f32>(0.86, 0.89, 0.96) * (moon_disc + moon_halo) * night;
// Skip entirely during day: pow(cos_m, 256) is expensive and
// the moon's invisible against bright sky anyway.
if (night > 0.05) {
let moon = -sun;
let cos_m = max(dot(dir, moon), 0.0);
let moon_disc = pow(cos_m, 256.0) * 0.9;
let moon_halo = pow(cos_m, 24.0) * 0.06;
sky = sky + vec3<f32>(0.86, 0.89, 0.96) * (moon_disc + moon_halo) * night;
}
return sky;
}

View file

@ -51,9 +51,13 @@ fn vs_shafts(@builtin(vertex_index) idx: u32) -> ShaftsOut {
return out;
}
const N_SAMPLES: i32 = 32;
const DECAY: f32 = 0.965;
const WEIGHT: f32 = 0.42;
// 32 → 16 samples. The earlier value was overkill at quarter-res —
// with 16-step decay the rays still trace cleanly without banding,
// and we cut the per-pixel cost in half. Compensating WEIGHT bump
// keeps total intensity the same.
const N_SAMPLES: i32 = 16;
const DECAY: f32 = 0.94;
const WEIGHT: f32 = 0.78;
const EXPOSURE: f32 = 0.30;
@fragment