diff --git a/invokeai/backend/stable_diffusion/diffusion_backend.py b/invokeai/backend/stable_diffusion/diffusion_backend.py
index 4191db734f9..eaa0600bc82 100644
--- a/invokeai/backend/stable_diffusion/diffusion_backend.py
+++ b/invokeai/backend/stable_diffusion/diffusion_backend.py
@@ -99,11 +99,15 @@ def combine_noise_preds(ctx: DenoiseContext) -> torch.Tensor:
         guidance_scale = ctx.inputs.conditioning_data.guidance_scale
         if isinstance(guidance_scale, list):
             guidance_scale = guidance_scale[ctx.step_index]
-
-        # Note: Although this `torch.lerp(...)` line is logically equivalent to the current CFG line, it seems to result
-        # in slightly different outputs. It is suspected that this is caused by small precision differences.
-        # return torch.lerp(ctx.negative_noise_pred, ctx.positive_noise_pred, guidance_scale)
-        return ctx.negative_noise_pred + guidance_scale * (ctx.positive_noise_pred - ctx.negative_noise_pred)
+        neg = ctx.negative_noise_pred
+        pos = ctx.positive_noise_pred
+        gs = guidance_scale
+        # Classic CFG combine: neg + gs * (pos - neg). Using torch.add with the `alpha` parameter
+        # folds the scale-and-add into a single fused operation, so the separate temporary from
+        # `gs * (pos - neg)` is not allocated. The `(pos - neg)` difference itself is still
+        # materialized as one intermediate tensor. torch.lerp(neg, pos, gs) is mathematically
+        # equivalent but has been observed to produce slightly different outputs (precision).
+        return torch.add(neg, pos - neg, alpha=gs)
 
     def run_unet(self, ctx: DenoiseContext, ext_manager: ExtensionsManager, conditioning_mode: ConditioningMode):
         sample = ctx.latent_model_input
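For reference, here is a small standalone sketch (not part of the diff) that exercises the formulations discussed above on dummy tensors. `neg`, `pos`, and `gs` are stand-ins for `ctx.negative_noise_pred`, `ctx.positive_noise_pred`, and the per-step guidance scale; the shapes and scale value are illustrative only. It shows that the `torch.add(..., alpha=...)` form agrees with the removed expression to floating-point precision, while `torch.lerp` can differ slightly, as the original comment noted.

```python
import torch

# Standalone comparison of the CFG-combine formulations (illustrative values, not the real
# DenoiseContext tensors).
torch.manual_seed(0)
neg = torch.randn(2, 4, 64, 64)
pos = torch.randn(2, 4, 64, 64)
gs = 7.5

original = neg + gs * (pos - neg)            # expression removed by this diff
fused = torch.add(neg, pos - neg, alpha=gs)  # expression added by this diff
lerped = torch.lerp(neg, pos, gs)            # same math; noted upstream as slightly different

# The fused form agrees with the original to floating-point precision (it may or may not be
# bit-identical, depending on whether the backend emits a fused multiply-add).
print(torch.allclose(original, fused))
# torch.lerp is where small numerical differences were observed per the removed comment.
print(torch.allclose(original, lerped), (original - lerped).abs().max().item())
```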