Skip to content

Commit ab60d7a

Browse files
fix: guidance_scale 1.0 everywhere, ace-understand auto-detect
guidance_scale: - Replaced all remaining 7.0 values with 1.0 (README, examples, tests). - Fixed the CFG threshold in debug-dit-cossim.py: it is now > 1.0 to match C++ (it was > 0, which caused Python to run 2x forward + APG while C++ skipped CFG). ace-understand: - Reads acestep.is_turbo from the DiT GGUF and writes matching defaults (steps/shift/guidance) so the output JSON works as-is with dit-vae. Everything else is up to the JSON; dit-vae just does what it says. examples/ace-understand.sh: - Roundtrip: audio -> ace-understand -> dit-vae -> 4 WAV variations. Uses --src-audio with audio_cover_strength 0.04 (with 50 SFT steps, 2 of them use the source as context). High values stay close to the source; low values let the model diverge.
1 parent 627cf43 commit ab60d7a

9 files changed

Lines changed: 71 additions & 11 deletions

File tree

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ cat > /tmp/repaint.json << 'EOF'
238238
"repainting_start": 10.0,
239239
"repainting_end": 25.0,
240240
"inference_steps": 50,
241-
"guidance_scale": 7.0,
241+
"guidance_scale": 1.0,
242242
"shift": 1.0
243243
}
244244
EOF
@@ -263,7 +263,7 @@ cat > /tmp/lego.json << 'EOF'
263263
"lyrics": "[Instrumental]",
264264
"lego": "guitar",
265265
"inference_steps": 50,
266-
"guidance_scale": 7.0,
266+
"guidance_scale": 1.0,
267267
"shift": 1.0
268268
}
269269
EOF
@@ -383,7 +383,7 @@ Valid names: `vocals`, `backing_vocals`, `drums`, `bass`, `guitar`,
383383
When set, passes the source audio to the DiT as context and builds the
384384
instruction `"Generate the {TRACK} track based on the audio context:"`.
385385
`audio_cover_strength` is forced to 1.0 (all steps see the source audio).
386-
Use `inference_steps=50`, `guidance_scale=7.0`, `shift=1.0` for base model.
386+
Use `inference_steps=50`, `guidance_scale=1.0`, `shift=1.0` for base model.
387387

388388
### LM sampling (ace-qwen3)
389389

examples/ace-understand.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
2+
# Roundtrip: audio -> understand -> SFT DiT -> 4 WAV variations
3+
#
4+
# Usage: ./ace-understand.sh input.wav (or input.mp3)
5+
#
6+
# understand:
7+
# input -> ace-understand.json (audio codes + metadata)
8+
#
9+
# dit-vae:
10+
# ace-understand.json -> output0.wav .. output3.wav
11+
12+
set -eu
13+
14+
if [ $# -lt 1 ]; then
15+
echo "Usage: $0 <input.wav|input.mp3>"
16+
exit 1
17+
fi
18+
19+
input="$1"
20+
21+
../build/ace-understand \
22+
--src-audio "$input" \
23+
--dit ../models/acestep-v15-sft-Q8_0.gguf \
24+
--vae ../models/vae-BF16.gguf \
25+
--model ../models/acestep-5Hz-lm-4B-Q8_0.gguf \
26+
-o ace-understand.json
27+
28+
sed -i \
29+
-e 's/"audio_cover_strength": *[0-9.]*/"audio_cover_strength": 0.04/' \
30+
ace-understand.json
31+
32+
../build/dit-vae \
33+
--src-audio "$input" \
34+
--request ace-understand.json \
35+
--text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
36+
--dit ../models/acestep-v15-sft-Q8_0.gguf \
37+
--vae ../models/vae-BF16.gguf \
38+
--batch 4 \
39+
--wav

examples/dit-only.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@
77
"timesignature": "4",
88
"vocal_language": "en",
99
"inference_steps": 8,
10+
"guidance_scale": 1.0,
1011
"shift": 3.0
1112
}

examples/lego.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
"lyrics": "[Instrumental]",
44
"lego": "guitar",
55
"inference_steps": 50,
6-
"guidance_scale": 7.0,
6+
"guidance_scale": 1.0,
77
"shift": 1.0
88
}

examples/partial.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,6 @@
44
"duration": 200,
55
"vocal_language": "fr",
66
"inference_steps": 8,
7+
"guidance_scale": 1.0,
78
"shift": 3.0
8-
}
9+
}

examples/simple-sft.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"caption": "Upbeat pop rock anthem with driving electric guitars, punchy drums, catchy vocal hooks, and a singalong chorus",
3+
"vocal_language": "fr",
34
"inference_steps": 50,
45
"guidance_scale": 1.0,
56
"shift": 1.0,
6-
"vocal_language": "fr"
77
}

examples/simple.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
"caption": "Upbeat pop rock anthem with driving electric guitars, punchy drums, catchy vocal hooks, and a singalong chorus",
3+
"vocal_language": "fr",
34
"inference_steps": 8,
5+
"guidance_scale": 1.0,
46
"shift": 3.0,
5-
"vocal_language": "fr"
67
}

tests/debug-dit-cossim.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"sft": {
2424
"gguf_base": "acestep-v15-sft",
2525
"config_path": "acestep-v15-sft",
26-
"steps": 50, "shift": 1.0, "guidance": 7.0,
26+
"steps": 50, "shift": 1.0, "guidance": 1.0,
2727
},
2828
}
2929

@@ -146,7 +146,7 @@ def run_python(dump_dir, req, cfg, lora_dir=None):
146146
from acestep.handler import AceStepHandler
147147

148148
os.makedirs(dump_dir, exist_ok=True)
149-
has_cfg = cfg["guidance"] > 0
149+
has_cfg = cfg["guidance"] > 1.0
150150

151151
caption = req["caption"]
152152
lyrics = req.get("lyrics", "")
@@ -319,7 +319,7 @@ def detok_hook(module, input, output):
319319
# comparison
320320

321321
def build_stages(cfg):
322-
has_cfg = cfg["guidance"] > 0
322+
has_cfg = cfg["guidance"] > 1.0
323323
steps = cfg["steps"]
324324
stages = [
325325
"text_hidden", "lyric_embed", "enc_hidden", "detok_output", "context", "noise",
@@ -434,7 +434,7 @@ def run_mode(mode_name, cfg, req, gguf_path, lora_dir=None):
434434

435435
tag = mode_name.upper() if mode_name == "sft" else mode_name.capitalize()
436436
cfg_str = f"steps={cfg['steps']}, shift={cfg['shift']}"
437-
if cfg['guidance'] > 0:
437+
if cfg['guidance'] > 1.0:
438438
cfg_str += f", CFG={cfg['guidance']}"
439439
print(f"[{tag}] {cfg_str} | {os.path.basename(gguf_path)}")
440440

tools/ace-understand.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,24 @@ int main(int argc, char ** argv) {
583583
out.timesignature = parsed.timesignature;
584584
out.vocal_language = parsed.vocal_language;
585585
out.audio_codes = codes_str;
586+
587+
// Set DiT defaults from model type (turbo vs SFT/base)
588+
if (dit_gguf) {
589+
GGUFModel gf = {};
590+
if (gf_load(&gf, dit_gguf)) {
591+
if (gf_get_bool(gf, "acestep.is_turbo")) {
592+
out.inference_steps = 8;
593+
out.shift = 3.0f;
594+
out.guidance_scale = 1.0f;
595+
} else {
596+
out.inference_steps = 50;
597+
out.shift = 1.0f;
598+
out.guidance_scale = 1.0f;
599+
}
600+
gf_close(&gf);
601+
}
602+
}
603+
586604
request_write(&out, output_path);
587605
}
588606

0 commit comments

Comments
 (0)