import os

import gradio as gr

from model2vec import StaticModel

# Suppress tokenizer parallelism warnings from the underlying tokenizer.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Static Model2Vec model: fast CPU inference, handles long texts.
# NOTE(review): the UI text below claims a 256-dim output — confirm against
# the model's actual output size (the response now reports it dynamically).
model = StaticModel.from_pretrained("minishlab/potion-base-32M")


def generate_embedding(text: str) -> dict:
    """Embed a single text and return a JSON-serializable response.

    Args:
        text: Input text of any length (long inputs are handled quickly
            by the static model).

    Returns:
        A dict with a consistent schema on every path:
        - "embedding": list of floats (empty on empty input)
        - "text": the cleaned input text ("" on empty input)
        - "dimension": length of the returned embedding vector
        - "note": status message ("" when OK)
    """
    if not text or not text.strip():
        # Same keys as the success path so API clients can rely on a
        # stable response shape. Dimension is 0 here because the
        # embedding list is empty (was a misleading hard-coded 256).
        return {
            "embedding": [],
            "text": "",
            "dimension": 0,
            "note": "Empty input",
        }

    cleaned_text = text.strip()

    # Static Model2Vec — no query/document prompt needed.
    # NOTE(review): `convert_to_numpy` / `normalize_embeddings` are
    # sentence-transformers-style kwargs; verify model2vec's encode()
    # accepts them rather than silently ignoring unknown kwargs.
    embedding = model.encode(
        [cleaned_text],
        convert_to_numpy=True,
        normalize_embeddings=True,  # ready for cosine similarity
    )[0].tolist()

    return {
        "embedding": embedding,
        "text": cleaned_text,
        "dimension": len(embedding),  # derived, not hard-coded
        "note": "",
    }


# Gradio UI + full REST API: single text in, one embedding out.
demo = gr.Interface(
    fn=generate_embedding,
    inputs=gr.Textbox(
        lines=12,
        placeholder="Paste your text here (500–1000+ tokens works instantly)...",
        label="Input Text",
    ),
    outputs=gr.JSON(label="Embedding Response"),
    title="⚡ Qwen3-Style Fast Embedding API (Single Text)",
    description="""Ultra-fast static embedding model (potion-base-32M).
Best reliable CPU option • 500× faster than transformers • Handles long texts instantly.
Returns **one** 256-dim embedding vector per call.""",
    examples=[
        ["What is the capital of France? Explain it in detail with historical context and why it matters today."],
        ["A very long document with many tokens to test speed... " * 50],
    ],
)

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without immediately starting a web server.
if __name__ == "__main__":
    demo.launch()