LoRA: Support HuggingFace dataset via data parameter (#996)

* LoRA: support huggingface dataset via `data` argument
* LoRA: Extract the load_custom_hf_dataset function
* LoRA: split small functions
* fix spelling errors
* handle load hf dataset error
* fix pre-commit lint
* update data argument help
* nits and doc

---------

Co-authored-by: Awni Hannun <awni@apple.com>
2025-12-16 02:08:55 +08:00 · 2024-09-30 22:36:21 +08:00
parent 50e5ca81a8
commit aa1c8abdc6
3 changed files with 93 additions and 51 deletions
--- a/llms/mlx_lm/tuner/datasets.py
+++ b/llms/mlx_lm/tuner/datasets.py
@@ -76,17 +76,14 @@ class CompletionsDataset(Dataset):
        return text


-def create_dataset(path: Path, tokenizer: PreTrainedTokenizer = None):
-    # Return empty dataset for non-existent paths
-    if not path.exists():
-        return []
-    with open(path, "r") as fid:
-        data = [json.loads(l) for l in fid]
-    if "messages" in data[0]:
+def create_dataset(data, tokenizer: PreTrainedTokenizer = None):
+    sample = data[0]
+
+    if "messages" in sample:
        return ChatDataset(data, tokenizer)
-    elif "prompt" in data[0] and "completion" in data[0]:
+    elif "prompt" in sample and "completion" in sample:
        return CompletionsDataset(data, tokenizer)
-    elif "text" in data[0]:
+    elif "text" in sample:
        return Dataset(data)
    else:
        raise ValueError(
@@ -95,54 +92,90 @@ def create_dataset(path: Path, tokenizer: PreTrainedTokenizer = None):
        )


-def load_dataset(args, tokenizer: PreTrainedTokenizer):
-    if getattr(args, "hf_dataset", None) is not None:
-        import datasets
+def load_local_dataset(data_path: Path, tokenizer: PreTrainedTokenizer):
+    def load_subset(path):
+        if not path.exists():
+            return []
+        with open(path, "r") as fid:
+            data = [json.loads(l) for l in fid]
+        return create_dataset(data, tokenizer)

-        hf_args = args.hf_dataset
-        dataset_name = hf_args["name"]
-        print(f"Loading Hugging Face dataset {dataset_name}.")
-        text_feature = hf_args.get("text_feature")
-        prompt_feature = hf_args.get("prompt_feature")
-        completion_feature = hf_args.get("completion_feature")
+    names = ("train", "valid", "test")
+    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
+    return train, valid, test

-        def create_hf_dataset(split: str = None):
-            ds = datasets.load_dataset(
-                dataset_name,
-                split=split,
-                **hf_args.get("config", {}),
-            )
-            if prompt_feature and completion_feature:
-                return CompletionsDataset(
-                    ds, tokenizer, prompt_feature, completion_feature
-                )
-            elif text_feature:
-                return Dataset(train_ds, text_key=text_feature)
-            else:
-                raise ValueError(
-                    "Specify either a prompt and completion feature or a text "
-                    "feature for the Hugging Face dataset."
-                )

-        if args.train:
-            train_split = hf_args.get("train_split", "train[:80%]")
-            valid_split = hf_args.get("valid_split", "train[-10%:]")
-            train = create_hf_dataset(split=train_split)
-            valid = create_hf_dataset(split=valid_split)
-        else:
-            train, valid = [], []
-        if args.test:
-            test = create_hf_dataset(split=hf_args.get("test_split"))
-        else:
-            test = []
+def load_hf_dataset(data_id: str, tokenizer: PreTrainedTokenizer):
+    from datasets import exceptions, load_dataset
+
+    try:
+        dataset = load_dataset(data_id)

-    else:
        names = ("train", "valid", "test")
-        data_path = Path(args.data)

        train, valid, test = [
-            create_dataset(data_path / f"{n}.jsonl", tokenizer) for n in names
+            create_dataset(dataset[n], tokenizer) if n in dataset.keys() else []
+            for n in names
        ]
+
+    except exceptions.DatasetNotFoundError:
+        raise ValueError(f"Not found Hugging Face dataset: {data_id} .")
+
+    return train, valid, test
+
+
+def load_custom_hf_dataset(args, tokenizer: PreTrainedTokenizer):
+    import datasets
+
+    hf_args = args.hf_dataset
+    dataset_name = hf_args["name"]
+    print(f"Loading Hugging Face dataset {dataset_name}.")
+    text_feature = hf_args.get("text_feature")
+    prompt_feature = hf_args.get("prompt_feature")
+    completion_feature = hf_args.get("completion_feature")
+
+    def create_hf_dataset(split: str = None):
+        ds = datasets.load_dataset(
+            dataset_name,
+            split=split,
+            **hf_args.get("config", {}),
+        )
+        if prompt_feature and completion_feature:
+            return CompletionsDataset(ds, tokenizer, prompt_feature, completion_feature)
+        elif text_feature:
+            return Dataset(train_ds, text_key=text_feature)
+        else:
+            raise ValueError(
+                "Specify either a prompt and completion feature or a text "
+                "feature for the Hugging Face dataset."
+            )
+
+    if args.train:
+        train_split = hf_args.get("train_split", "train[:80%]")
+        valid_split = hf_args.get("valid_split", "train[-10%:]")
+        train = create_hf_dataset(split=train_split)
+        valid = create_hf_dataset(split=valid_split)
+    else:
+        train, valid = [], []
+    if args.test:
+        test = create_hf_dataset(split=hf_args.get("test_split"))
+    else:
+        test = []
+
+    return train, valid, test
+
+
+def load_dataset(args, tokenizer: PreTrainedTokenizer):
+    if getattr(args, "hf_dataset", None) is not None:
+        train, valid, test = load_custom_hf_dataset(args, tokenizer)
+    else:
+        data_path = Path(args.data)
+        if data_path.exists():
+            train, valid, test = load_local_dataset(data_path, tokenizer)
+        else:
+            print(f"Loading Hugging Face dataset {args.data}.")
+            train, valid, test = load_hf_dataset(args.data, tokenizer)
+
    if args.train and len(train) == 0:
        raise ValueError(
            "Training set not found or empty. Must provide training set for fine-tuning."