# mlx-examples/stable_diffusion/image2image.py

# Copyright © 2023 Apple Inc.

import argparse

import mlx.core as mx
import numpy as np
from PIL import Image
from tqdm import tqdm

from stable_diffusion import StableDiffusion

if __name__ == "__main__":
    # Command-line entry point: read an image, noise and denoise its latents
    # under a text prompt, decode the results and save them as a single grid.
    parser = argparse.ArgumentParser(
        description="Generate images from an image and a textual prompt using stable diffusion"
    )
    parser.add_argument("image")
    parser.add_argument("prompt")
    parser.add_argument("--strength", type=float, default=0.9)
    parser.add_argument("--n_images", type=int, default=4)
    parser.add_argument("--steps", type=int, default=50)
    parser.add_argument("--cfg", type=float, default=7.5)
    parser.add_argument("--negative_prompt", default="")
    parser.add_argument("--n_rows", type=int, default=1)
    parser.add_argument("--decoding_batch_size", type=int, default=1)
    parser.add_argument("--quantize", "-q", action="store_true")
    parser.add_argument("--no-float16", dest="float16", action="store_false")
    parser.add_argument("--preload-models", action="store_true")
    parser.add_argument("--output", default="out.png")
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    # The output grid is built by reshaping the batch into n_rows rows, which
    # only works when n_rows evenly divides n_images. Reject bad values here,
    # before any model work, instead of failing after the generation has run.
    if args.n_rows < 1 or args.n_images % args.n_rows != 0:
        parser.error(
            f"--n_rows ({args.n_rows}) must be a positive divisor of "
            f"--n_images ({args.n_images})"
        )

    sd = StableDiffusion("stabilityai/stable-diffusion-2-1-base", float16=args.float16)
    if args.quantize:
        # QuantizedLinear is not imported at the top of this file; import it
        # locally so that --quantize does not fail with a NameError.
        from mlx.nn import QuantizedLinear

        QuantizedLinear.quantize_module(sd.text_encoder)
        QuantizedLinear.quantize_module(sd.unet, group_size=32, bits=8)
    if args.preload_models:
        sd.ensure_models_are_loaded()

    # Read the image
    img = Image.open(args.image)

    # Make sure image shape is divisible by 64
    W, H = (dim - dim % 64 for dim in (img.width, img.height))
    if W != img.width or H != img.height:
        print(f"Warning: image shape is not divisible by 64, downsampling to {W}x{H}")
        img = img.resize((W, H), Image.NEAREST)  # use desired downsampling filter

    # Keep the first three channels and rescale values from 0..255 to -1..1.
    img = mx.array(np.array(img))
    img = (img[:, :, :3].astype(mx.float32) / 255) * 2 - 1

    # Noise and denoise the latents produced by encoding img.
    latents = sd.generate_latents_from_image(
        img,
        args.prompt,
        strength=args.strength,
        n_images=args.n_images,
        cfg_weight=args.cfg,
        num_steps=args.steps,
        negative_text=args.negative_prompt,
    )
    # x_t is bound only by the loop below; start from None so that an empty
    # iteration is reported clearly rather than as a NameError further down.
    x_t = None
    for x_t in tqdm(latents, total=int(args.steps * args.strength)):
        mx.eval(x_t)
    if x_t is None:
        raise SystemExit(
            "No denoising steps were run; increase --steps or --strength"
        )

    # The following is not necessary but it may help in memory
    # constrained systems by reusing the memory kept by the unet and the text
    # encoders.
    del sd.text_encoder
    del sd.unet
    del sd.sampler
    peak_mem_unet = mx.metal.get_peak_memory() / 1024**3

    # Decode the final latents into images, decoding_batch_size at a time
    decoded = []
    for i in tqdm(range(0, args.n_images, args.decoding_batch_size)):
        decoded.append(sd.decode(x_t[i : i + args.decoding_batch_size]))
        mx.eval(decoded[-1])
    peak_mem_overall = mx.metal.get_peak_memory() / 1024**3

    # Arrange them on a grid: pad each image by 8 on every side of axes 1 and
    # 2, then lay the batch out as n_rows rows of B // n_rows images.
    x = mx.concatenate(decoded, axis=0)
    x = mx.pad(x, [(0, 0), (8, 8), (8, 8), (0, 0)])
    B, H, W, C = x.shape
    x = x.reshape(args.n_rows, B // args.n_rows, H, W, C).transpose(0, 2, 1, 3, 4)
    x = x.reshape(args.n_rows * H, B // args.n_rows * W, C)
    # NOTE(review): presumably sd.decode returns values in 0..1 - confirm.
    x = (x * 255).astype(mx.uint8)

    # Save them to disc
    im = Image.fromarray(np.array(x))
    im.save(args.output)

    # Report the peak memory used during generation
    if args.verbose:
        print(f"Peak memory used for the unet: {peak_mem_unet:.3f}GB")
        print(f"Peak memory used overall:      {peak_mem_overall:.3f}GB")
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`# Copyright © 2023 Apple Inc.`

			`import argparse`

			`import mlx.core as mx`
			`import numpy as np`
			`from PIL import Image`
			`from tqdm import tqdm`

			`from stable_diffusion import StableDiffusion`

			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(`
			`description="Generate images from an image and a textual prompt using stable diffusion"`
			`)`
			`parser.add_argument("image")`
			`parser.add_argument("prompt")`
			`parser.add_argument("--strength", type=float, default=0.9)`
			`parser.add_argument("--n_images", type=int, default=4)`
			`parser.add_argument("--steps", type=int, default=50)`
			`parser.add_argument("--cfg", type=float, default=7.5)`
			`parser.add_argument("--negative_prompt", default="")`
			`parser.add_argument("--n_rows", type=int, default=1)`
			`parser.add_argument("--decoding_batch_size", type=int, default=1)`
Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00			`parser.add_argument("--quantize", "-q", action="store_true")`
			`parser.add_argument("--no-float16", dest="float16", action="store_false")`
			`parser.add_argument("--preload-models", action="store_true")`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`parser.add_argument("--output", default="out.png")`
Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00			`parser.add_argument("--verbose", "-v", action="store_true")`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`args = parser.parse_args()`

Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00			`sd = StableDiffusion("stabilityai/stable-diffusion-2-1-base", float16=args.float16)`
			`if args.quantize:`
			`QuantizedLinear.quantize_module(sd.text_encoder)`
			`QuantizedLinear.quantize_module(sd.unet, group_size=32, bits=8)`
			`if args.preload_models:`
			`sd.ensure_models_are_loaded()`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00
			`# Read the image`
Stable Diffusion: Input image downsampling (#276) 2024-01-17 05:45:00 +08:00			`img = Image.open(args.image)`

			`# Make sure image shape is divisible by 64`
			`W, H = (dim - dim % 64 for dim in (img.width, img.height))`
			`if W != img.width or H != img.height:`
			`print(f"Warning: image shape is not divisible by 64, downsampling to {W}x{H}")`
			`img = img.resize((W, H), Image.NEAREST) # use desired downsampling filter`

			`img = mx.array(np.array(img))`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`img = (img[:, :, :3].astype(mx.float32) / 255) * 2 - 1`

			`# Noise and denoise the latents produced by encoding img.`
			`latents = sd.generate_latents_from_image(`
			`img,`
			`args.prompt,`
			`strength=args.strength,`
			`n_images=args.n_images,`
			`cfg_weight=args.cfg,`
			`num_steps=args.steps,`
			`negative_text=args.negative_prompt,`
			`)`
			`for x_t in tqdm(latents, total=int(args.steps * args.strength)):`
			`mx.eval(x_t)`

Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00			`# The following is not necessary but it may help in memory`
			`# constrained systems by reusing the memory kept by the unet and the text`
			`# encoders.`
			`del sd.text_encoder`
			`del sd.unet`
			`del sd.sampler`
			`peak_mem_unet = mx.metal.get_peak_memory() / 1024**3`

Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`# Decode them into images`
			`decoded = []`
			`for i in tqdm(range(0, args.n_images, args.decoding_batch_size)):`
			`decoded.append(sd.decode(x_t[i : i + args.decoding_batch_size]))`
			`mx.eval(decoded[-1])`
Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00			`peak_mem_overall = mx.metal.get_peak_memory() / 1024**3`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00
			`# Arrange them on a grid`
			`x = mx.concatenate(decoded, axis=0)`
			`x = mx.pad(x, [(0, 0), (8, 8), (8, 8), (0, 0)])`
			`B, H, W, C = x.shape`
			`x = x.reshape(args.n_rows, B // args.n_rows, H, W, C).transpose(0, 2, 1, 3, 4)`
			`x = x.reshape(args.n_rows * H, B // args.n_rows * W, C)`
			`x = (x * 255).astype(mx.uint8)`

			`# Save them to disc`
two minor fixes (#335) 2024-01-19 06:18:13 +08:00			`im = Image.fromarray(np.array(x))`
Add an image2image example in the stable diffusion (#198) 2023-12-29 10:31:45 +08:00			`im.save(args.output)`
Stable diffusion XL (#516) 2024-03-09 02:24:19 +08:00
			`# Report the peak memory used during generation`
			`if args.verbose:`
			`print(f"Peak memory used for the unet: {peak_mem_unet:.3f}GB")`
			`print(f"Peak memory used overall: {peak_mem_overall:.3f}GB")`