import os

import gradio as gr

from model2vec import StaticModel

# Suppress tokenizer parallelism warnings from the underlying tokenizer.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Static Model2Vec model: fast CPU inference, handles long texts.
# NOTE(review): the UI text below claims a 256-dim output — confirm against
# the model's actual output size (the response now reports it dynamically).
model = StaticModel.from_pretrained("minishlab/potion-base-32M")


def generate_embedding(text: str) -> dict:
    """Embed a single text and return a JSON-serializable response.

    Args:
        text: Input text of any length (long inputs are handled quickly
            by the static model).

    Returns:
        A dict with a consistent schema on every path:
        - "embedding": list of floats (empty on empty input)
        - "text": the cleaned input text ("" on empty input)
        - "dimension": length of the returned embedding vector
        - "note": status message ("" when OK)
    """
    if not text or not text.strip():
        # Same keys as the success path so API clients can rely on a
        # stable response shape. Dimension is 0 here because the
        # embedding list is empty (was a misleading hard-coded 256).
        return {
            "embedding": [],
            "text": "",
            "dimension": 0,
            "note": "Empty input",
        }

    cleaned_text = text.strip()

    # Static Model2Vec — no query/document prompt needed.
    # NOTE(review): `convert_to_numpy` / `normalize_embeddings` are
    # sentence-transformers-style kwargs; verify model2vec's encode()
    # accepts them rather than silently ignoring unknown kwargs.
    embedding = model.encode(
        [cleaned_text],
        convert_to_numpy=True,
        normalize_embeddings=True,  # ready for cosine similarity
    )[0].tolist()

    return {
        "embedding": embedding,
        "text": cleaned_text,
        "dimension": len(embedding),  # derived, not hard-coded
        "note": "",
    }


# Gradio UI + full REST API: single text in, one embedding out.
demo = gr.Interface(
    fn=generate_embedding,
    inputs=gr.Textbox(
        lines=12,
        placeholder="Paste your text here (500–1000+ tokens works instantly)...",
        label="Input Text",
    ),
    outputs=gr.JSON(label="Embedding Response"),
    title="⚡ Qwen3-Style Fast Embedding API (Single Text)",
    description="""Ultra-fast static embedding model (potion-base-32M).
Best reliable CPU option • 500× faster than transformers • Handles long texts instantly.
Returns **one** 256-dim embedding vector per call.""",
    examples=[
        ["What is the capital of France? Explain it in detail with historical context and why it matters today."],
        ["A very long document with many tokens to test speed... " * 50],
    ],
)

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without immediately starting a web server.
if __name__ == "__main__":
    demo.launch()