mlx-examples/llms/mlx_lm/convert2.py
2025-02-05 18:50:14 -08:00

19 lines
593 B
Python

import pandas as pd
import os
# Define dataset directory
dataset_dir = "/Users/cshang/Desktop/test_grpo/data"
# Convert each Parquet file to JSONL
for file in os.listdir(dataset_dir):
if file.endswith(".parquet"):
parquet_path = os.path.join(dataset_dir, file)
jsonl_path = os.path.join(dataset_dir, file.replace(".parquet", ".jsonl"))
# Load Parquet file
df = pd.read_parquet(parquet_path)
# Convert to JSONL format
df.to_json(jsonl_path, orient="records", lines=True)
print(f"Converted {parquet_path} -> {jsonl_path}")