import os

import pandas as pd

try:
    from datasets import load_dataset
except ImportError:  # pragma: no cover - optional dependency
    # Keep the module importable without Hugging Face `datasets`, so the
    # synthetic fallback in load_legal_dataset() can actually be reached.
    load_dataset = None


def _first_answer_text(answers):
    """Return the first answer string from a CUAD ``answers`` cell, or ""."""
    if not answers:
        return ""
    if isinstance(answers, dict):
        # SQuAD-style dict of parallel lists: {"text": [...], "answer_start": [...]}.
        texts = answers.get("text")
    else:
        # Layout the original code assumed: a list of {"text": [...]} records.
        texts = answers[0]["text"]
    return texts[0] if texts else ""


def _load_cuad_sample(sample_size):
    """Fetch the first ``sample_size`` CUAD training rows as question/answer pairs."""
    if load_dataset is None:
        raise ImportError("the 'datasets' package is not installed")

    dataset = load_dataset("cuad", "cuad_v1", split=f"train[:{sample_size}]")
    df = pd.DataFrame(dataset)

    # NOTE(review): the Hub export is believed to name this column "question"
    # (SQuAD layout), while the original code read "question_text" - confirm
    # against the dataset card. Both names are accepted; if neither exists the
    # KeyError propagates and the caller falls back to the synthetic sample.
    question_col = "question_text" if "question_text" in df.columns else "question"

    return pd.DataFrame(
        {
            "question": "Summarize the key legal clause: " + df[question_col],
            "answer": df["answers"].apply(_first_answer_text),
        }
    )


def _synthetic_sample():
    """Build the three-row hand-written sample used when CUAD cannot be loaded."""
    return pd.DataFrame(
        [
            {
                "question": "Summarize the confidentiality clause: The parties agree to keep all proprietary information confidential for five years.",
                "answer": "Both parties must keep proprietary info secret for five years.",
            },
            {
                "question": "Summarize the termination clause: Either party may terminate with 30 days written notice without cause.",
                "answer": "Either side can end the agreement with 30 days written notice.",
            },
            {
                "question": "Summarize the liability clause: Liability is limited to direct damages not exceeding fees paid in the last 12 months.",
                "answer": "Each party's liability is capped to direct damages up to fees from the past year.",
            },
        ]
    )


def load_legal_dataset(sample_size=200, output_path="datasets/legal_sample.jsonl"):
    """
    Loads a small portion of the CUAD dataset (contract clauses).

    Falls back to a tiny synthetic sample if the dataset is unavailable
    (e.g., offline, or the `datasets` package is not installed). The result
    is also written to ``output_path`` as JSON Lines.

    Args:
        sample_size: Number of leading training rows to request from CUAD.
        output_path: Destination JSONL file; missing parent directories are
            created.

    Returns:
        A DataFrame with two columns, ``question`` and ``answer``.
    """
    try:
        data = _load_cuad_sample(sample_size)
    except Exception as exc:  # pragma: no cover - offline/sandbox fallback
        # Deliberately broad: any failure while loading or reshaping the Hub
        # data (network, schema, missing package) should degrade to the sample.
        print(f"⚠️ Unable to load CUAD from Hub ({exc}). Using synthetic sample.")
        data = _synthetic_sample()

    out_dir = os.path.dirname(output_path)
    if out_dir:  # os.makedirs("") raises, so skip for bare file names
        os.makedirs(out_dir, exist_ok=True)
    data.to_json(output_path, orient="records", lines=True)
    print(f"✅ Saved sample dataset to {output_path}")
    return data


if __name__ == "__main__":
    load_legal_dataset()