From 8c9148a8fdfcc7ac1dd2c55a6dd3ff1e81ca33cc Mon Sep 17 00:00:00 2001
From: Ovid <24634+Ovid@users.noreply.github.com>
Date: Tue, 20 Feb 2024 05:11:45 +0100
Subject: [PATCH] Make it easier to know in which file we have bad JSON data
 (#458)

* Make it easier to know which file we have bad JSON data in.

* Use a loop rather than repeat code sections.

I previously had these as separate cut-n-drooled sections of code. This change makes it a clean loop.

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* Small fix to previous code suggestion to restore a missing variable.

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
---
 lora/lora.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lora/lora.py b/lora/lora.py
index d0ff032b..1d847e97 100644
--- a/lora/lora.py
+++ b/lora/lora.py
@@ -138,8 +138,17 @@ class Dataset:
 
 
 def load(args):
+    def load_and_check(name):  # build the `name` split; report its path on failure
+        dataset_path = Path(args.data) / f"{name}.jsonl"
+        try:
+            return Dataset(dataset_path)  # caller unpacks the returned datasets
+        except Exception as e:
+            print(f"Unable to build dataset {dataset_path} ({e})")
+            raise
+
     names = ("train", "valid", "test")
-    train, valid, test = (Dataset(Path(args.data) / f"{n}.jsonl") for n in names)
+    train, valid, test = (load_and_check(n) for n in names)
+
     if args.train and len(train) == 0:
         raise ValueError(
             "Training set not found or empty. Must provide training set for fine-tuning."