CLIP (ViT) (#315)
* probably approximately correct CLIPTextEncoder
* implemented CLIPEncoderLayer as built-in nn.TransformerEncoderLayer
* replaced embedding layer with simple matrix
* implemented ViT
* added ViT tests
* fixed tests
* added pooler_output for text
* implemented complete CLIPModel
* implemented init
* implemented convert.py and from_pretrained
* fixed some minor bugs and added the README.md
* removed unused tokenizer comments
* removed unused deps
* updated ACKNOWLEDGEMENTS.md
* Feat: Image Processor for CLIP (#1) @nkasmanoff:
  * clip image processor
  * added example usage
  * refactored image preprocessing
  * deleted unused image_config.py
  * removed preprocessing port
  * added dependency to mlx-data
  * fixed attribution and moved photos to assets
  * implemented a simple port of CLIPImageProcessor
  * review changes
* PR review changes
* renamed overly verbose arg
* updated README.md
* nits in readme / conversion
* simplify some stuff, remove unneeded inits
* remove more init stuff
* more simplify
* make test a unit test
* update main readme
* readme nits

---------

Co-authored-by: Noah Kasmanoff <nkasmanoff@gmail.com>
Co-authored-by: Awni Hannun <awni@apple.com>
commit 94358219cf (parent ba3a9355d1), committed by GitHub
clip/clip.py (new file, 31 lines)
@@ -0,0 +1,31 @@
from typing import Tuple

from image_processor import CLIPImageProcessor
from model import CLIPModel
from tokenizer import CLIPTokenizer


def load(model_dir: str) -> Tuple[CLIPModel, CLIPTokenizer, CLIPImageProcessor]:
    model = CLIPModel.from_pretrained(model_dir)
    tokenizer = CLIPTokenizer.from_pretrained(model_dir)
    img_processor = CLIPImageProcessor.from_pretrained(model_dir)
    return model, tokenizer, img_processor


if __name__ == "__main__":
    from PIL import Image

    model, tokenizer, img_processor = load("mlx_model")
    inputs = {
        "input_ids": tokenizer(["a photo of a cat", "a photo of a dog"]),
        "pixel_values": img_processor(
            [Image.open("assets/cat.jpeg"), Image.open("assets/dog.jpeg")]
        ),
    }
    output = model(**inputs)

    # Get text and image embeddings:
    text_embeds = output.text_embeds
    image_embeds = output.image_embeds
    print("Text embeddings shape:", text_embeds.shape)
    print("Image embeddings shape:", image_embeds.shape)
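
For a quick sanity check of the embeddings above, the sketch below computes an image-text cosine-similarity matrix. This is not part of the commit; it assumes text_embeds and image_embeds come back as unnormalized projection vectors (as in the Hugging Face CLIPModel), so it L2-normalizes them before taking dot products:

import mlx.core as mx

def cosine_similarity(text_embeds: mx.array, image_embeds: mx.array) -> mx.array:
    # Sketch only: assumes the embeddings are not already unit-norm.
    # L2-normalize each row, then take all pairwise dot products.
    text_embeds = text_embeds / mx.linalg.norm(text_embeds, axis=-1, keepdims=True)
    image_embeds = image_embeds / mx.linalg.norm(image_embeds, axis=-1, keepdims=True)
    return text_embeds @ image_embeds.T  # shape: (num_texts, num_images)

With the cat/dog example above, the matching image-text pairs should score highest along the diagonal of the resulting 2x2 matrix.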