mlx-examples/clip/test.py
Gabrijel Boduljak 94358219cf
CLIP (ViT) (#315)
* probably approximately correct CLIPTextEncoder

* implemented CLIPEncoderLayer as built-in nn.TransformerEncoderLayer

* replaced embedding layer with simple matrix

* implemented ViT

* added ViT tests

* fixed tests

* added pooler_output for text

* implemented complete CLIPModel

* implemented init

* implemented convert.py and from_pretrained

* fixed some minor bugs and added the README.md

* removed tokenizer unused comments

* removed unused deps

* updated ACKNOWLEDGEMENTS.md

* Feat: Image Processor for CLIP (#1)

@nkasmanoff:
* clip image processor
* added example usage

* refactored image preprocessing

* deleted unused image_config.py

* removed preprocessing port

* added dependency to mlx-data

* fixed attribution and moved photos to assets

* implemented a simple port of CLIPImageProcessor

* review changes

* PR review changes

* renamed too verbose arg

* updated README.md

* nits in readme / conversion

* simplify some stuff, remove unneeded inits

* remove more init stuff

* more simplify

* make test a unit test

* update main readme

* readme nits

---------

Co-authored-by: Noah Kasmanoff <nkasmanoff@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
2024-01-31 14:19:53 -08:00


import unittest

import mlx.core as mx
import model
import numpy as np
import torch
import transformers
from image_processor import CLIPImageProcessor
from PIL import Image
from tokenizer import CLIPTokenizer
from transformers import AutoTokenizer
from transformers.image_processing_utils import ChannelDimension

MLX_PATH = "mlx_model"
HF_PATH = "openai/clip-vit-base-patch32"
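

# Note: MLX_PATH is assumed to hold locally converted weights (see this example's
# convert.py), while the Hugging Face models are loaded directly from HF_PATH.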
def load_mlx_models(path):
    image_proc = CLIPImageProcessor.from_pretrained(path)
    tokenizer = CLIPTokenizer.from_pretrained(path)
    clip = model.CLIPModel.from_pretrained(path)
    return image_proc, tokenizer, clip


def load_hf_models(path):
    image_proc = transformers.CLIPImageProcessor.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    clip = transformers.CLIPModel.from_pretrained(path)
    return image_proc, tokenizer, clip
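

# Each test compares an MLX component against the corresponding Hugging Face
# implementation on identical inputs.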
class TestCLIP(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.mx_image_proc, cls.mx_tokenizer, cls.mx_clip = load_mlx_models(MLX_PATH)
        cls.hf_image_proc, cls.hf_tokenizer, cls.hf_clip = load_hf_models(HF_PATH)
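
    # The MLX image processor produces channels-last (NHWC) output, so the HF
    # reference is asked for ChannelDimension.LAST to make the arrays comparable.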
    def test_image_processor(self):
        image = Image.open("assets/cat.jpeg")
        mx_data = self.mx_image_proc([image])
        hf_data = mx.array(
            np.array(
                self.hf_image_proc([image], data_format=ChannelDimension.LAST)[
                    "pixel_values"
                ]
            )
        )
        self.assertTrue(mx.allclose(mx_data, hf_data, atol=1e-5))
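
    # The MLX tokenizer's tokenize() returns a 1-D token array, so a batch axis is
    # added before comparing with the HF tokenizer's (1, seq_len) output.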
    def test_text_tokenizer(self):
        texts = ["a photo of a cat", "a photo of a dog"]
        for txt in texts:
            self.assertTrue(
                np.array_equal(
                    self.mx_tokenizer.tokenize(txt)[None, :],
                    self.hf_tokenizer(txt, return_tensors="np")["input_ids"],
                ),
            )

    def test_text_encoder(self):
        texts = ["a photo of a cat", "a photo of a dog"]
        # Tokenize
        hf_tokens = self.hf_tokenizer(texts, return_tensors="pt")
        mx_tokens = self.mx_tokenizer(texts)
        # Get expected
        with torch.inference_mode():
            expected_out = self.hf_clip.text_model(**hf_tokens)
            expected_last_hidden = expected_out.last_hidden_state.numpy()
            expected_pooler_output = expected_out.pooler_output.numpy()
        out = self.mx_clip.text_model(mx_tokens)
        self.assertTrue(
            np.allclose(out.last_hidden_state, expected_last_hidden, atol=1e-5)
        )
        self.assertTrue(
            np.allclose(out.pooler_output, expected_pooler_output, atol=1e-5)
        )

    def test_vision_encoder(self):
        # Load and process test image
        x = self.hf_image_proc(
            images=[Image.open("assets/dog.jpeg")], return_tensors="np"
        ).pixel_values
        # Infer with HuggingFace model
        with torch.inference_mode():
            # Get expected
            x_tc = torch.tensor(x)
            expected_out = self.hf_clip.vision_model(x_tc)
            expected_last_hidden = expected_out.last_hidden_state.numpy()
            expected_pooler_output = expected_out.pooler_output.numpy()
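        # The HF processor returns NCHW pixel values, so they are transposed to NHWC
        # for the MLX vision tower. Tolerances are looser than in the text test,
        # presumably to absorb small numerical differences across the ViT layers.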
        # Test MLX vision encoder
        out = self.mx_clip.vision_model(mx.array(x.transpose(0, 2, 3, 1)))
        self.assertTrue(
            np.allclose(
                out.last_hidden_state, expected_last_hidden, rtol=1e-4, atol=1e-3
            ),
        )
        self.assertTrue(
            np.allclose(
                out.pooler_output, expected_pooler_output, rtol=1e-4, atol=1e-3
            ),
        )

    def test_clip_model(self):
        image_input = self.hf_image_proc(
            images=[Image.open("assets/cat.jpeg"), Image.open("assets/dog.jpeg")],
            return_tensors="np",
        )["pixel_values"]
        text = ["a photo of a cat", "a photo of a dog"]
        tokens = self.hf_tokenizer(text, return_tensors="np")["input_ids"]
        with torch.inference_mode():
            expected_out = self.hf_clip(
                input_ids=torch.tensor(tokens),
                pixel_values=torch.tensor(image_input),
                return_loss=True,
            )
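        # Run the MLX model on the same inputs; pixel values are transposed from
        # NCHW to NHWC, and return_loss=True exercises the contrastive loss as well.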
        out = self.mx_clip(
            input_ids=mx.array(tokens),
            pixel_values=mx.array(image_input.transpose((0, 2, 3, 1))),
            return_loss=True,
        )
        self.assertTrue(
            np.allclose(out.text_embeds, expected_out.text_embeds, atol=1e-5)
        )
        self.assertTrue(
            np.allclose(out.image_embeds, expected_out.image_embeds, atol=1e-5)
        )
        self.assertTrue(np.allclose(out.loss, expected_out.loss, atol=1e-5))


if __name__ == "__main__":
    unittest.main()