
Commit bfbb929

feat: do not convert bf16 to f32 (#1055)
1 parent 689e44c commit bfbb929

4 files changed: +6, -33 lines changed

docs/flux.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
 
 You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell](https://huggingface.co/leejet/FLUX.1-schnell-gguf), this way you don't have to do the conversion yourself.
 
-Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
+For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
 ```

model.cpp

Lines changed: 3 additions & 20 deletions
@@ -123,11 +123,6 @@ bool is_unused_tensor(std::string name) {
     return false;
 }
 
-float bf16_to_f32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
 uint16_t f8_e4m3_to_f16(uint8_t f8) {
     // do we need to support uz?
 
@@ -210,13 +205,6 @@ uint16_t f8_e5m2_to_f16(uint8_t fp8) {
     return fp16_sign | (fp16_exponent << 10) | fp16_mantissa;
 }
 
-void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
-    // support inplace op
-    for (int64_t i = n - 1; i >= 0; i--) {
-        dst[i] = bf16_to_f32(src[i]);
-    }
-}
-
 void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
@@ -495,7 +483,7 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
     if (dtype == "F16") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "BF16") {
-        ttype = GGML_TYPE_F32;
+        ttype = GGML_TYPE_BF16;
     } else if (dtype == "F32") {
         ttype = GGML_TYPE_F32;
     } else if (dtype == "F64") {
@@ -623,10 +611,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 
         size_t tensor_data_size = end - begin;
 
-        if (dtype == "BF16") {
-            tensor_storage.is_bf16 = true;
-            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
-        } else if (dtype == "F8_E4M3") {
+        if (dtype == "F8_E4M3") {
             tensor_storage.is_f8_e4m3 = true;
             // f8 -> f16
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
@@ -1522,9 +1507,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             read_time_ms.fetch_add(t1 - t0);
 
             t0 = ggml_time_ms();
-            if (tensor_storage.is_bf16) {
-                bf16_to_f32_vec((uint16_t*)read_buf, (float*)target_buf, tensor_storage.nelements());
-            } else if (tensor_storage.is_f8_e4m3) {
+            if (tensor_storage.is_f8_e4m3) {
                 f8_e4m3_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
             } else if (tensor_storage.is_f8_e5m2) {
                 f8_e5m2_to_f16_vec((uint8_t*)read_buf, (uint16_t*)target_buf, tensor_storage.nelements());
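
Note (illustrative, not part of the commit): the removed helpers relied on bf16 being the upper 16 bits of an IEEE-754 f32 (same sign and exponent, truncated mantissa), which is why widening was a plain 16-bit left shift. A minimal standalone sketch of that relationship, using memcpy instead of the original reinterpret_cast to stay within strict-aliasing rules:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// bf16 -> f32 by padding the low 16 mantissa bits with zeros.
static float bf16_bits_to_f32(uint16_t bf16) {
    uint32_t bits = static_cast<uint32_t>(bf16) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    // 0x3FC0: sign 0, exponent 0x7F (bias 127), mantissa 0b1000000 -> 1.5f
    std::printf("%f\n", bf16_bits_to_f32(0x3FC0));  // prints 1.500000
    return 0;
}
```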

model.h

Lines changed: 2 additions & 5 deletions
@@ -168,7 +168,6 @@ struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
     ggml_type expected_type = GGML_TYPE_COUNT;
-    bool is_bf16 = false;
     bool is_f8_e4m3 = false;
     bool is_f8_e5m2 = false;
     bool is_f64 = false;
@@ -202,7 +201,7 @@
     }
 
     int64_t nbytes_to_read() const {
-        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
+        if (is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
         } else if (is_f64 || is_i64) {
             return nbytes() * 2;
@@ -250,9 +249,7 @@
     std::string to_string() const {
         std::stringstream ss;
         const char* type_name = ggml_type_name(type);
-        if (is_bf16) {
-            type_name = "bf16";
-        } else if (is_f8_e4m3) {
+        if (is_f8_e4m3) {
            type_name = "f8_e4m3";
        } else if (is_f8_e5m2) {
            type_name = "f8_e5m2";

stable-diffusion.cpp

Lines changed: 0 additions & 7 deletions
@@ -307,13 +307,6 @@ class StableDiffusionGGML {
         }
 
         auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-        for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "llm") &&
-                ends_with(name, "weight") &&
-                (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
-                tensor_storage.expected_type = GGML_TYPE_F16;
-            }
-        }
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
         ggml_type wtype = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
