mirror of https://github.com/ml-explore/mlx-examples.git
synced 2025-09-01 04:14:38 +08:00
nits and doc
@@ -196,7 +196,13 @@ To use Hugging Face datasets, first install the `datasets` package:
 pip install datasets
 ```
 
-Specify the Hugging Face dataset arguments in a YAML config. For example:
+If the Hugging Face dataset is already in a supported format, you can specify
+it on the command line. For example, pass `--data mlx-community/wikisql` to
+train on the pre-formatted WikiSQL data.
+
+Otherwise, provide a mapping of keys in the dataset to the features MLX LM
+expects. Use a YAML config to specify the Hugging Face dataset arguments. For
+example:
 
 ```
 hf_dataset:
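For context, a minimal sketch of what such a YAML mapping could look like. Only the `hf_dataset:` key comes from this diff; the dataset name and the feature-key names below are illustrative assumptions, not part of this commit:

```
hf_dataset:
  name: "billsum"               # Hugging Face dataset id (hypothetical example)
  prompt_feature: "text"        # dataset column mapped to the prompt (assumed key name)
  completion_feature: "summary" # dataset column mapped to the completion (assumed key name)
```

Datasets that already ship in a supported format skip this mapping entirely and go straight through the `--data` flag, as the rewritten doc text above describes.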
@@ -79,7 +79,10 @@ def build_parser():
     parser.add_argument(
         "--data",
         type=str,
-        help="Directory with {train, valid, test}.jsonl files or the name of a Huggingface dataset (e.g., 'mlx-community/wikisql')",
+        help=(
+            "Directory with {train, valid, test}.jsonl files or the name "
+            "of a Hugging Face dataset (e.g., 'mlx-community/wikisql')"
+        ),
     )
     parser.add_argument(
         "--lora-layers",
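The rewrapped help text relies on Python's implicit concatenation of adjacent string literals inside parentheses. A minimal self-contained sketch of the pattern; the parser and the hard-coded invocation here are illustrative, not this repo's actual entry point:

```
import argparse

parser = argparse.ArgumentParser(description="toy example of a wrapped help string")
parser.add_argument(
    "--data",
    type=str,
    # Adjacent string literals are joined at compile time, so this reads as
    # one help string while each source line stays within the length limit.
    help=(
        "Directory with {train, valid, test}.jsonl files or the name "
        "of a Hugging Face dataset (e.g., 'mlx-community/wikisql')"
    ),
)
args = parser.parse_args(["--data", "mlx-community/wikisql"])
print(args.data)  # -> mlx-community/wikisql
```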
@@ -89,20 +89,16 @@ def create_dataset(data, tokenizer: PreTrainedTokenizer = None):
     )
 
 
-def load_local_data(path: Path, tokenizer: PreTrainedTokenizer):
-    if not path.exists():
-        return []
-    with open(path, "r") as fid:
-        data = [json.loads(l) for l in fid]
-
-    return create_dataset(data, tokenizer)
-
-
 def load_local_dataset(data_path: Path, tokenizer: PreTrainedTokenizer):
+    def load_subset(path):
+        if not path.exists():
+            return []
+        with open(path, "r") as fid:
+            data = [json.loads(l) for l in fid]
+        return create_dataset(data, tokenizer)
+
     names = ("train", "valid", "test")
-    train, valid, test = [
-        load_local_data(data_path / f"{n}.jsonl", tokenizer) for n in names
-    ]
+    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
     return train, valid, test
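The refactor replaces the module-level `load_local_data` helper with a `load_subset` closure that captures `tokenizer`, so the comprehension only has to vary the path. A runnable sketch of the same shape, with a stub `create_dataset` standing in for the real one:

```
import json
from pathlib import Path


def create_dataset(data, tokenizer=None):
    # Stub for the module's real create_dataset, which would wrap the
    # parsed records in a Dataset object built around the tokenizer.
    return data


def load_local_dataset(data_path: Path, tokenizer=None):
    # load_subset closes over `tokenizer`, so callers below pass only a path.
    def load_subset(path):
        if not path.exists():
            return []
        with open(path, "r") as fid:
            data = [json.loads(line) for line in fid]
        return create_dataset(data, tokenizer)

    names = ("train", "valid", "test")
    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
    return train, valid, test
```

Note that a missing split comes back as an empty list rather than raising, which lets training proceed when, say, no test.jsonl is present.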