nits and doc

Awni Hannun
2024-09-29 19:21:53 -07:00
parent 6d46cbb4a6
commit 3b1c70ea2d
3 changed files with 19 additions and 14 deletions


@@ -196,7 +196,13 @@ To use Hugging Face datasets, first install the `datasets` package:
 pip install datasets
 ```
 
-Specify the Hugging Face dataset arguments in a YAML config. For example:
+If the Hugging Face dataset is already in a supported format, you can specify
+it on the command line. For example, pass `--data mlx-community/wikisql` to
+train on the pre-formatted WikiSQL data.
+
+Otherwise, provide a mapping of keys in the dataset to the features MLX LM
+expects. Use a YAML config to specify the Hugging Face dataset arguments. For
+example:
 
 ```
 hf_dataset:

@@ -79,7 +79,10 @@ def build_parser():
     parser.add_argument(
         "--data",
         type=str,
-        help="Directory with {train, valid, test}.jsonl files or the name of a Huggingface dataset (e.g., 'mlx-community/wikisql')",
+        help=(
+            "Directory with {train, valid, test}.jsonl files or the name "
+            "of a Hugging Face dataset (e.g., 'mlx-community/wikisql')"
+        ),
     )
     parser.add_argument(
         "--lora-layers",


@@ -89,20 +89,16 @@ def create_dataset(data, tokenizer: PreTrainedTokenizer = None):
     )
 
-def load_local_data(path: Path, tokenizer: PreTrainedTokenizer):
-    if not path.exists():
-        return []
-    with open(path, "r") as fid:
-        data = [json.loads(l) for l in fid]
-    return create_dataset(data, tokenizer)
-
-
 def load_local_dataset(data_path: Path, tokenizer: PreTrainedTokenizer):
+    def load_subset(path):
+        if not path.exists():
+            return []
+        with open(path, "r") as fid:
+            data = [json.loads(l) for l in fid]
+        return create_dataset(data, tokenizer)
+
     names = ("train", "valid", "test")
-    train, valid, test = [
-        load_local_data(data_path / f"{n}.jsonl", tokenizer) for n in names
-    ]
+    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
     return train, valid, test
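The refactor replaces the module-level helper with a closure, so `tokenizer` is captured once instead of being threaded through every call, and a missing split file simply yields an empty dataset. A self-contained sketch of the same pattern, with `create_dataset` stubbed out since its body is not part of this diff:

```python
import json
from pathlib import Path

def create_dataset(data, tokenizer=None):
    # Stub standing in for the real create_dataset named in the hunk header.
    return data

def load_local_dataset(data_path: Path, tokenizer=None):
    def load_subset(path):
        # Missing split files (e.g., no test.jsonl) become empty datasets.
        if not path.exists():
            return []
        with open(path, "r") as fid:
            data = [json.loads(line) for line in fid]
        return create_dataset(data, tokenizer)

    names = ("train", "valid", "test")
    train, valid, test = [load_subset(data_path / f"{n}.jsonl") for n in names]
    return train, valid, test

# Usage with the hypothetical my_data directory from the earlier sketch:
train, valid, test = load_local_dataset(Path("my_data"))
print(len(train), len(valid), len(test))
```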