Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-06-24 01:17:28 +08:00)
Start memory-efficient flux finetuning branch
This commit is contained in:
parent 4971462bf0
commit 67607a8e13
@@ -16,6 +16,10 @@ from PIL import Image
 from flux import FluxPipeline, Trainer, load_dataset
 
 
+def quantization_predicate(name, m):
+    return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0
+
+
 def generate_progress_images(iteration, flux, args):
     """Generate images to monitor the progress of the finetuning."""
     out_dir = Path(args.output_dir)
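The new quantization_predicate is handed to nn.quantize as a class_predicate: MLX walks the module tree and quantizes only the submodules for which the predicate returns True, so only layers that support quantization and whose input dimension is a multiple of 512 are converted. A minimal, self-contained sketch of that pattern (the toy layer sizes are made up for illustration and are not from this commit):

```python
import mlx.nn as nn


def quantization_predicate(name, m):
    # Quantize only layers that support it and whose input dimension is a
    # multiple of 512; everything else stays in full precision.
    return hasattr(m, "to_quantized") and m.weight.shape[1] % 512 == 0


# Hypothetical toy model; the training script passes flux.flow / flux.t5 / flux.clip.
model = nn.Sequential(nn.Linear(512, 100), nn.Linear(100, 10))
nn.quantize(model, class_predicate=quantization_predicate)
# The first Linear (weight shape (100, 512)) is quantized; the second
# (weight shape (10, 100)) is skipped because 100 % 512 != 0.
```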
@@ -24,11 +28,10 @@ def generate_progress_images(iteration, flux, args):
     print(f"Generating {str(out_file)}", flush=True)
 
     # Generate some images and arrange them in a grid
-    n_rows = 2
-    n_images = 4
+    n_rows = 2 if args.progress_num_images % 2 == 0 else 1
     x = flux.generate_images(
         args.progress_prompt,
-        n_images,
+        args.progress_num_images,
         args.progress_steps,
     )
     x = mx.pad(x, [(0, 0), (4, 4), (4, 4), (0, 0)])
@@ -42,6 +45,16 @@ def generate_progress_images(iteration, flux, args):
     im = Image.fromarray(np.array(x))
     im.save(out_file)
 
+    # generate_images reloads the text encoders in order to remove them from
+    # RAM. In memory pressured environments this will swap the flow transformer
+    # to disk and back to RAM during generation.
+    #
+    # However, we have to requantize the text encoders for the next time we
+    # want to use them.
+    if args.quantize:
+        nn.quantize(flux.t5, class_predicate=quantization_predicate)
+        nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
 
 def save_adapters(iteration, flux, args):
     out_dir = Path(args.output_dir)
@@ -74,6 +87,17 @@ def setup_arg_parser():
         ],
         help="Which flux model to train",
     )
+    parser.add_argument(
+        "--quantize",
+        "-q",
+        action="store_true",
+        help="Quantize the models to reduce the memory required for training",
+    )
+    parser.add_argument(
+        "--gradient-checkpointing",
+        action="store_true",
+        help="Enable gradient checkpointing to reduce the memory required for training",
+    )
     parser.add_argument(
         "--guidance", type=float, default=4.0, help="The guidance factor to use."
     )
@@ -118,6 +142,12 @@ def setup_arg_parser():
         default=50,
         help="Generate images every PROGRESS_EVERY steps",
     )
+    parser.add_argument(
+        "--progress-num-images",
+        type=int,
+        default=4,
+        help="How many progress images to generate",
+    )
     parser.add_argument(
         "--checkpoint-every",
         type=int,
@@ -162,6 +192,14 @@ if __name__ == "__main__":
     # initial weights.
     mx.random.seed(0x0F0F0F0F)
     flux = FluxPipeline("flux-" + args.model)
+    if args.quantize:
+        nn.quantize(flux.flow, class_predicate=quantization_predicate)
+        nn.quantize(flux.t5, class_predicate=quantization_predicate)
+        nn.quantize(flux.clip, class_predicate=quantization_predicate)
+
+    if args.gradient_checkpointing:
+        flux.gradient_checkpointing()
+
     flux.flow.freeze()
     flux.linear_to_lora_layers(args.lora_rank, args.lora_blocks)
 
@@ -254,8 +292,12 @@ if __name__ == "__main__":
     guidance = mx.full((args.batch_size,), args.guidance, dtype=flux.dtype)
 
     # An initial generation to compare
-    generate_progress_images(0, flux, args)
+    # generate_progress_images(0, flux, args)
+    flux.reload_text_encoders()
+    del flux.t5
+    del flux.clip
 
+    mx.metal.reset_peak_memory()
     grads = None
     losses = []
     tic = time.time()
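The mx.metal.reset_peak_memory() call zeroes the peak-memory counter so that any later reporting reflects only the training loop rather than model loading and setup. A hedged sketch of the measurement pattern this enables (the reporting code is illustrative, and it assumes mx.metal.get_peak_memory() is available in the MLX version in use):

```python
import mlx.core as mx

mx.metal.reset_peak_memory()                 # start counting from here
# ... run one or more training steps ...
peak_gb = mx.metal.get_peak_memory() / 1e9   # peak bytes since the reset
print(f"Peak memory during training: {peak_gb:.3f} GB")
```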
@@ -7,6 +7,12 @@ import mlx.nn as nn
 from mlx.utils import tree_unflatten
 from tqdm import tqdm
 
+from .layers import (
+    DoubleStreamBlock,
+    SingleStreamBlock,
+    disable_gradient_checkpointing,
+    enable_gradient_checkpointing,
+)
 from .lora import LoRALinear
 from .sampler import FluxSampler
 from .utils import (
@@ -234,7 +240,7 @@ class FluxPipeline:
         for i, block in zip(range(num_blocks), all_blocks):
             loras = []
             for name, module in block.named_modules():
-                if isinstance(module, nn.Linear):
+                if isinstance(module, (nn.Linear, nn.QuantizedLinear)):
                     loras.append((name, LoRALinear.from_base(module, r=rank)))
             block.update_modules(tree_unflatten(loras))
 
@@ -244,3 +250,13 @@ class FluxPipeline:
             if isinstance(module, LoRALinear):
                 fused_layers.append((name, module.fuse()))
         self.flow.update_modules(tree_unflatten(fused_layers))
+
+    def gradient_checkpointing(self, enable: bool = True):
+        """Replace the call function of SingleStreamBlock and DoubleStreamBlock
+        to a checkpointing one."""
+        if enable:
+            enable_gradient_checkpointing(SingleStreamBlock)
+            enable_gradient_checkpointing(DoubleStreamBlock)
+        else:
+            disable_gradient_checkpointing(SingleStreamBlock)
+            disable_gradient_checkpointing(DoubleStreamBlock)
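Note that this method patches the SingleStreamBlock and DoubleStreamBlock classes rather than this pipeline instance, so enabling checkpointing affects every pipeline in the process until it is disabled again. An illustrative usage sketch (the model name is a placeholder, not taken from this commit):

```python
flux = FluxPipeline("flux-dev")            # placeholder model name
flux.gradient_checkpointing()              # patch the block classes for training
# ... training loop ...
flux.gradient_checkpointing(enable=False)  # restore the original __call__
```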
@@ -9,6 +9,37 @@ import mlx.core as mx
 import mlx.nn as nn
 
 
+def enable_gradient_checkpointing(module_class):
+    if hasattr(module_class, "_original_call"):
+        raise ValueError(
+            f"Gradient checkpointing is already enabled for {module_class.__name__}"
+        )
+
+    fn = module_class.__call__
+    module_class._original_call = fn
+
+    def checkpointed_fn(module_instance, *args, **kwargs):
+        def inner_fn(params, *args, **kwargs):
+            module_instance.update(params)
+            return fn(module_instance, *args, **kwargs)
+
+        return mx.checkpoint(inner_fn)(
+            module_instance.trainable_parameters(), *args, **kwargs
+        )
+
+    module_class.__call__ = checkpointed_fn
+
+
+def disable_gradient_checkpointing(module_class):
+    if not hasattr(module_class, "_original_call"):
+        raise ValueError(
+            f"Gradient checkpointing is not enabled for {module_class.__name__}"
+        )
+
+    module_class.__call__ = module_class._original_call
+    delattr(module_class, "_original_call")
+
+
 def _rope(pos: mx.array, dim: int, theta: float):
     scale = mx.arange(0, dim, 2, dtype=mx.float32) / dim
     omega = 1.0 / (theta**scale)
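enable_gradient_checkpointing swaps a block class's __call__ for one that runs the original forward under mx.checkpoint, so intermediate activations are recomputed during the backward pass instead of being kept in memory. A minimal sketch with a toy module (illustrative only; it assumes the enable_gradient_checkpointing defined above is in scope):

```python
import mlx.core as mx
import mlx.nn as nn


class Block(nn.Module):
    def __init__(self, dims):
        super().__init__()
        self.l1 = nn.Linear(dims, dims)
        self.l2 = nn.Linear(dims, dims)

    def __call__(self, x):
        return self.l2(nn.relu(self.l1(x)))


enable_gradient_checkpointing(Block)   # every Block now checkpoints its forward

block = Block(16)
x = mx.random.normal((4, 16))


def loss_fn(params, x):
    block.update(params)
    return block(x).sum()


# Gradients match the unpatched version; only peak memory use changes.
loss, grads = mx.value_and_grad(loss_fn)(block.trainable_parameters(), x)
```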
@@ -9,12 +9,15 @@ import mlx.nn as nn
 class LoRALinear(nn.Module):
     @staticmethod
     def from_base(
-        linear: nn.Linear,
+        linear: nn.Module,
         r: int = 8,
         dropout: float = 0.0,
         scale: float = 1.0,
     ):
         output_dims, input_dims = linear.weight.shape
+        if isinstance(linear, nn.QuantizedLinear):
+            input_dims *= 32 // linear.bits
+
         lora_lin = LoRALinear(
             input_dims=input_dims,
             output_dims=output_dims,
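The new branch accounts for the fact that nn.QuantizedLinear stores its weight packed into 32-bit words, so weight.shape[1] is the packed width rather than the real input dimension; multiplying by 32 // bits undoes that packing before the LoRA matrices are sized. A small worked example (the sizes are illustrative, not from this commit):

```python
bits = 4                               # bits per quantized value
packed_out, packed_in = 3072, 128      # shape of a hypothetical quantized weight
values_per_word = 32 // bits           # 8 four-bit values per uint32
input_dims = packed_in * values_per_word
print(input_dims)                      # 1024: the width the LoRA adapter must match
```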
@@ -26,6 +29,9 @@ class LoRALinear(nn.Module):
         return lora_lin
 
     def fuse(self):
+        if isinstance(self.linear, nn.QuantizedLinear):
+            raise NotImplementedError("Cannot fuse QLoRA layers yet.")
+
         linear = self.linear
         bias = "bias" in linear
         weight = linear.weight