diff --git a/stable_diffusion/README.md b/stable_diffusion/README.md index ab7affa3..385ce6d9 100644 --- a/stable_diffusion/README.md +++ b/stable_diffusion/README.md @@ -67,7 +67,7 @@ Image 2 Image There is also the option of generating images based on another image using the example script `image2image.py`. To do that an image is first encoded using the autoencoder to get its latent representation and then noise is added according -to the forward diffusion process and the `strength` parameter. A `stregnth` of +to the forward diffusion process and the `strength` parameter. A `strength` of 0.0 means no noise and a `strength` of 1.0 means starting from completely random noise. @@ -78,6 +78,7 @@ The command to generate the above images is: python image2image.py --strength 0.5 original.png 'A lit fireplace' +*Note: `image2image.py` will automatically downsample your input image to guarantee that its dimensions are divisible by 64 (each dimension is rounded down to the nearest multiple of 64). If you want full control of this process, resize your image prior to using the script.* Performance ----------- diff --git a/stable_diffusion/image2image.py b/stable_diffusion/image2image.py index ca325d48..bb4c9442 100644 --- a/stable_diffusion/image2image.py +++ b/stable_diffusion/image2image.py @@ -28,7 +28,15 @@ if __name__ == "__main__": sd = StableDiffusion() # Read the image - img = mx.array(np.array(Image.open(args.image))) + img = Image.open(args.image) + + # Make sure image shape is divisible by 64 + W, H = (dim - dim % 64 for dim in (img.width, img.height)) + if W != img.width or H != img.height: + print(f"Warning: image shape is not divisible by 64, downsampling to {W}x{H}") + img = img.resize((W, H), Image.NEAREST) # use desired downsampling filter + + img = mx.array(np.array(img)) img = (img[:, :, :3].astype(mx.float32) / 255) * 2 - 1 # Noise and denoise the latents produced by encoding img.