Add support for multiturn fewshot examples and chat templates

Added two new arguments to the evaluation script, `--fewshot-as-multiturn` and `--apply-chat-template`, which are passed through to the similarly named lm_eval options (`fewshot_as_multiturn` and `apply_chat_template` of `lm_eval.simple_evaluate`). These options are very often used to ensure apples-to-apples comparisons of lm_eval evaluation results.
2025-08-29 12:26:07 +08:00 · 2024-12-23 10:45:17 -05:00 · 2024-12-23 10:45:17 -05:00 · d352074e73
commit d352074e73
parent c4833a2f55
1 changed files with 14 additions and 0 deletions
--- a/llms/mlx_lm/evaluate.py
+++ b/llms/mlx_lm/evaluate.py
@ -332,6 +332,18 @@ def main():
        type=float,
    )
    parser.add_argument("--seed", type=int, default=123, help="Random seed.")
    parser.add_argument(
        "--fewshot-as-multiturn",
        action="store_true",
        help="Whether to provide the fewshot examples as a multiturn conversation or a single user turn.",
        default=False,
    )
    parser.add_argument(
        "--apply-chat-template",
        action="store_true",
        help="Specifies whether to apply a chat template to the prompt",
        default=False,
    )
    args = parser.parse_args()
    output_dir = Path(args.output_dir)
@ -347,6 +359,8 @@ def main():
    results = lm_eval.simple_evaluate(
        model=lm,
        tasks=args.tasks,
        fewshot_as_multiturn=args.fewshot_as_multiturn,
        apply_chat_template=args.apply_chat_template,
        num_fewshot=args.num_shots,
        limit=args.limit,
        random_seed=args.seed,