Make it easier to know in which file we have bad JSON data (#458)

* Make it easier to know which file we have bad JSON data in.
* Use a loop rather than repeat code sections. I previously had these as separate cut-n-drooled sections of code. This change makes it a clean loop.
  Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
* Small fix to previous code suggestion to restore a missing variable.
---------
Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
2025-12-13 16:38:59 +08:00 · 2024-02-20 05:11:45 +01:00
parent 88458c4e40
commit 8c9148a8fd
1 changed files with 10 additions and 1 deletions
--- a/lora/lora.py
+++ b/lora/lora.py
@@ -138,8 +138,17 @@ class Dataset:


 def load(args):
+    def load_and_check(name):  # build the Dataset for one split, naming the file on failure
+        dataset_path = Path(args.data) / f"{name}.jsonl"
+        try:
+            return Dataset(dataset_path)  # must be returned: the caller unpacks the results
+        except Exception as e:
+            print(f"Unable to build dataset {dataset_path} ({e})")  # identify the bad file
+            raise
+
    names = ("train", "valid", "test")
-    train, valid, test = (Dataset(Path(args.data) / f"{n}.jsonl") for n in names)
+    train, valid, test = (load_and_check(n) for n in names)
+
    if args.train and len(train) == 0:
        raise ValueError(
            "Training set not found or empty. Must provide training set for fine-tuning."