import os
import pandas as pd
from datasets import load_dataset


def _first_answer_text(answers):
    """Return the first answer string held in a CUAD ``answers`` cell, or ``""``.

    Accepts the SQuAD-style dict ``{"text": [...], "answer_start": [...]}``
    as well as a list of such dicts, and tolerates missing/empty values.
    """
    # Alternative shape: a list of answer dicts. Unwrap the first entry and
    # treat it like the dict shape below.
    if isinstance(answers, (list, tuple)):
        if not answers:
            return ""
        answers = answers[0]
    texts = answers.get("text") if isinstance(answers, dict) else answers
    if texts is None:
        return ""
    if isinstance(texts, str):
        return texts
    try:
        # ``len`` instead of truthiness so array-like values work too.
        return str(texts[0]) if len(texts) > 0 else ""
    except (TypeError, IndexError, KeyError):
        return ""


def _normalize_cuad_frame(df):
    """Reduce a raw CUAD frame to the ``question``/``answer`` columns we export."""
    question_col = next(
        (name for name in ("question", "question_text") if name in df.columns),
        None,
    )
    if question_col is None or "answers" not in df.columns:
        raise KeyError(f"unexpected CUAD schema, columns: {list(df.columns)}")
    return pd.DataFrame(
        {
            "question": "Summarize the key legal clause: " + df[question_col],
            "answer": df["answers"].apply(_first_answer_text),
        }
    )


def load_legal_dataset(
    *,
    split: str = "train[:200]",
    output_path: str = "datasets/legal_sample.jsonl",
) -> pd.DataFrame:
    """
    Load a small portion of the CUAD dataset (contract clauses) and save it as JSONL.

    Falls back to a tiny synthetic sample if the dataset cannot be loaded or
    normalized (e.g., offline or in a sandbox).

    Args:
        split: Split expression passed to ``load_dataset``.
        output_path: Destination of the JSON Lines file; missing parent
            directories are created.

    Returns:
        A DataFrame with ``question`` and ``answer`` columns. The same rows
        are written to ``output_path``, one JSON object per line.
    """
    try:
        dataset = load_dataset("cuad", "cuad_v1", split=split)
        data = _normalize_cuad_frame(pd.DataFrame(dataset))
    # Deliberately broad: any failure to obtain usable CUAD rows should fall
    # back to the synthetic sample rather than abort the script.
    except Exception as exc:  # pragma: no cover - offline/sandbox fallback
        print(f"⚠️ Unable to load CUAD from Hub ({exc}). Using synthetic sample.")
        data = pd.DataFrame(
            [
                {
                    "question": "Summarize the confidentiality clause: The parties agree to keep all proprietary information confidential for five years.",
                    "answer": "Both parties must keep proprietary info secret for five years.",
                },
                {
                    "question": "Summarize the termination clause: Either party may terminate with 30 days written notice without cause.",
                    "answer": "Either side can end the agreement with 30 days written notice.",
                },
                {
                    "question": "Summarize the liability clause: Liability is limited to direct damages not exceeding fees paid in the last 12 months.",
                    "answer": "Each party's liability is capped to direct damages up to fees from the past year.",
                },
            ]
        )

    # ``os.makedirs("")`` raises, so only create a directory when the path has one.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    data.to_json(output_path, orient="records", lines=True)
    print(f"✅ Saved sample dataset to {output_path}")
    return data

# Script entry point: when this module is executed directly (not imported),
# build the sample dataset and write it to disk. The returned DataFrame is
# discarded here.
if __name__ == "__main__":
    load_legal_dataset()