"""Run a single chat-style generation with mlx_lm and print the result.

Loads a main model and a smaller draft model, wraps one benchmark question
in the main tokenizer's chat template, and calls ``mlx_lm.generate`` with
sampling settings. The draft model is loaded but is only passed to the
generator when the ``draft_model=`` keyword at the bottom is uncommented.
"""

import mlx_lm

# Alternative main model, kept as a toggle:
# model, tokenizer = mlx_lm.load("mlx-community/SmolLM-1.7B-Instruct-fp16")
# NOTE(review): machine-specific absolute path; this only works where the
# checkpoint exists at this location.
model, tokenizer = mlx_lm.load("/Users/llwu/models/mlx/Qwen2-0.5B-8bit-Instruct")

# Smaller model, presumably intended as the draft for speculative decoding;
# it is unused unless `draft_model=draft_model` is enabled below.
draft_model, draft_tokenizer = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")

# https://github.com/hemingkx/Spec-Bench/blob/main/data/spec_bench/question.jsonl
prompt = (
    "Develop a Python program that reads all the text files under a "
    "directory and returns top-5 words with the most number of occurrences."
)

# Render the question as a single user turn. tokenize=False keeps the result
# as text, and add_generation_prompt=True asks the template to append the
# marker that opens the assistant's reply.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,
)

# NOTE(review): `temp`, `min_p` and `repetition_penalty` are passed directly
# as keywords here; confirm the installed mlx_lm version still accepts them
# in this form.
mlx_lm.generate(
    model,
    tokenizer,
    prompt=prompt,
    verbose=True,
    max_tokens=500,
    temp=1.0,
    min_p=0.1,
    repetition_penalty=1.2,
    # draft_model=draft_model,
)