From 8c9148a8fdfcc7ac1dd2c55a6dd3ff1e81ca33cc Mon Sep 17 00:00:00 2001
From: Ovid <24634+Ovid@users.noreply.github.com>
Date: Tue, 20 Feb 2024 05:11:45 +0100
Subject: [PATCH] Make it easier to know in which file we have bad JSON data (#458)

* Make it easier to know which file we have bad JSON data in.

* Use a loop rather than repeat code sections.

I previously had these as separate cut-n-drooled sections of code. This change makes it a clean loop.

Co-authored-by: Awni Hannun

* Small fix to previous code suggestion to restore a missing variable.

---------

Co-authored-by: Awni Hannun
---
 lora/lora.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lora/lora.py b/lora/lora.py
index d0ff032b..1d847e97 100644
--- a/lora/lora.py
+++ b/lora/lora.py
@@ -138,8 +138,17 @@ class Dataset:
 
 
 def load(args):
+    def load_and_check(name):
+        dataset_path = Path(args.data) / f"{name}.jsonl"
+        try:
+            return Dataset(dataset_path)
+        except Exception as e:
+            print(f"Unable to build dataset {dataset_path} ({e})")
+            raise
+
     names = ("train", "valid", "test")
-    train, valid, test = (Dataset(Path(args.data) / f"{n}.jsonl") for n in names)
+    train, valid, test = (load_and_check(n) for n in names)
+
     if args.train and len(train) == 0:
         raise ValueError(
             "Training set not found or empty. Must provide training set for fine-tuning."