Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-09-01 12:49:50 +08:00)
add deepseek coder example (#172)
* feat: add example for deepseek coder
* chore: remove hardcoded rope_scaling_factor
* feat: add quantization support
* chore: update readme
* chore: clean up the rope scaling factor param in create cos sin theta
* feat: add repetition_penalty
* style/consistency changes to ease future integration
* nits in README
* one more typo

---------

Co-authored-by: Awni Hannun <awni@apple.com>
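For context on the repetition_penalty bullet above: the sketch below shows how such a penalty is commonly applied to logits before sampling. The function name, signature, and call site are illustrative assumptions, not the exact code added by this commit.

    import mlx.core as mx

    def apply_repetition_penalty(logits: mx.array, generated_tokens, penalty: float) -> mx.array:
        # Illustrative sketch (not the commit's exact code): dampen the logits
        # of tokens that were already generated so they are less likely to repeat.
        if len(generated_tokens) == 0:
            return logits
        indices = mx.array(list(generated_tokens))
        selected = logits[:, indices]
        # Positive logits are divided by the penalty and negative ones multiplied,
        # so previously seen tokens lose probability in both cases (CTRL-style).
        logits[:, indices] = mx.where(selected < 0, selected * penalty, selected / penalty)
        return logits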
@@ -15,6 +15,7 @@ import torch
 from llama import Llama, ModelArgs, sanitize_config
+from mlx.utils import tree_flatten, tree_map, tree_unflatten
 
 
 def llama(model_path):
     SHARD_FIRST = ["wv", "wq", "wk", "w1", "w3", "output"]
     SHARD_SECOND = ["tok_embeddings", "wo", "w2"]
@@ -185,13 +186,13 @@ if __name__ == "__main__":
         action="store_true",
     )
     parser.add_argument(
-        "--q_group_size",
+        "--q-group-size",
         help="Group size for quantization.",
         type=int,
         default=64,
     )
     parser.add_argument(
-        "--q_bits",
+        "--q-bits",
         help="Bits per weight for quantization.",
         type=int,
         default=4,
     )
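The second hunk only renames the quantization flags to dashed form; argparse still exposes them as args.q_group_size and args.q_bits. As a hedged sketch of how such flags typically drive a quantization pass in MLX, assuming a loaded model object and the mlx.nn.quantize helper (this call site is not part of the diff):

    import mlx.nn as nn

    # Sketch only: quantize the model's linear layers using the parsed CLI flags.
    nn.quantize(model, group_size=args.q_group_size, bits=args.q_bits)

An invocation with the renamed flags would then look like `python convert.py --q-group-size 64 --q-bits 4` (the script name is an assumption based on repo conventions, not shown in this diff).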