nits and doc

Awni Hannun
2024-09-29 19:21:53 -07:00
parent 6d46cbb4a6
commit 3b1c70ea2d
3 changed files with 19 additions and 14 deletions


@@ -196,7 +196,13 @@ To use Hugging Face datasets, first install the `datasets` package:
 pip install datasets
 ```
 
-Specify the Hugging Face dataset arguments in a YAML config. For example:
+If the Hugging Face dataset is already in a supported format, you can specify
+it on the command line. For example, pass `--data mlx-community/wikisql` to
+train on the pre-formatted WikiSQL data.
+
+Otherwise, provide a mapping of keys in the dataset to the features MLX LM
+expects. Use a YAML config to specify the Hugging Face dataset arguments. For
+example:
 
 ```
 hf_dataset:

@@ -79,7 +79,10 @@ def build_parser():
     parser.add_argument(
         "--data",
         type=str,
-        help="Directory with {train, valid, test}.jsonl files or the name of a Huggingface dataset (e.g., 'mlx-community/wikisql')",
+        help=(
+            "Directory with {train, valid, test}.jsonl files or the name "
+            "of a Hugging Face dataset (e.g., 'mlx-community/wikisql')"
+        ),
     )
     parser.add_argument(
         "--lora-layers",


@@ -89,20 +89,16 @@ def create_dataset(data, tokenizer: PreTrainedTokenizer = None):
     )
 
-def load_local_data(path: Path, tokenizer: PreTrainedTokenizer):
-    if not path.exists():
-        return []
-    with open(path, "r") as fid:
-        data = [json.loads(l) for l in fid]
-    return create_dataset(data, tokenizer)
-
-
 def load_local_dataset(data_path: Path, tokenizer: PreTrainedTokenizer):
+    def load_subset(path):
+        if not path.exists():
+            return []
+        with open(path, "r") as fid:
+            data = [json.loads(l) for l in fid]
+        return create_dataset(data, tokenizer)
+
     names = ("train", "valid", "test")
-    train, valid, test = [
-        load_local_data(data_path / f"{n}.jsonl", tokenizer) for n in names
-    ]
+    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
     return train, valid, test
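The refactor replaces the module-level helper with a closure, so `tokenizer` is captured once instead of being threaded through every call, and a missing split file simply yields an empty dataset. A self-contained sketch of the same pattern, with `create_dataset` stubbed out since its body is not part of this diff:

```python
import json
from pathlib import Path

def create_dataset(data, tokenizer=None):
    # Stub standing in for the real create_dataset named in the hunk header.
    return data

def load_local_dataset(data_path: Path, tokenizer=None):
    def load_subset(path):
        # Missing split files (e.g., no test.jsonl) become empty datasets.
        if not path.exists():
            return []
        with open(path, "r") as fid:
            data = [json.loads(line) for line in fid]
        return create_dataset(data, tokenizer)

    names = ("train", "valid", "test")
    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
    return train, valid, test

# Usage with the hypothetical my_data directory from the earlier sketch:
train, valid, test = load_local_dataset(Path("my_data"))
print(len(train), len(valid), len(test))
```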