from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from typing import List, Optional
from transformers import AutoTokenizer, pipeline

MODEL_ID = "Equall/Saul-7B-Instruct-v1"

print("Loading model... this can take a while on first start.")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    tokenizer=tokenizer,
    device=-1,  # CPU only
    max_new_tokens=512,
    pad_token_id=tokenizer.eos_token_id,
)
app = FastAPI()


class ChatMessage(BaseModel):
    role: str  # "system" | "user" | "assistant"
    content: str


class ChatRequest(BaseModel):
    model: Optional[str] = None  # ignored, kept for OpenAI-style compatibility
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.0
    max_tokens: Optional[int] = 512


@app.get("/")  # decorator restored; health-check route, path assumed
def root():
    return {"status": "ok", "model": MODEL_ID}
def build_prompt(raw_messages: List[dict]) -> str:
    """
    Normalize messages so they fit the template:
    - Collect system messages and prepend their text to the first user message.
    - Drop leading assistant messages.
    - Merge consecutive messages with the same role.
    - Ensure we end up with user/assistant/user/assistant/... only.
    """
    system_parts = []
    ua_messages = []

    # Separate system vs user/assistant
    for m in raw_messages:
        role = m.get("role")
        content = m.get("content", "")
        if role == "system":
            if content:
                system_parts.append(content)
        elif role in ("user", "assistant"):
            ua_messages.append({"role": role, "content": content})
        # ignore anything else

    # Drop leading assistants (template wants to start with user)
    while ua_messages and ua_messages[0]["role"] != "user":
        ua_messages.pop(0)

    # Merge consecutive messages with same role
    normalized: List[dict] = []
    for m in ua_messages:
        if not normalized:
            normalized.append(m)
        else:
            if normalized[-1]["role"] == m["role"]:
                normalized[-1]["content"] += "\n\n" + m["content"]
            else:
                normalized.append(m)

    if not normalized:
        raise ValueError("No user messages found after normalization.")

    # Prepend system text into the first user message, if any
    if system_parts:
        system_text = "\n\n".join(system_parts)
        if normalized[0]["role"] == "user":
            normalized[0]["content"] = system_text + "\n\n" + normalized[0]["content"]
        else:
            # If for some reason first is assistant, prepend a synthetic user
            normalized.insert(0, {"role": "user", "content": system_text})

    # At this point we should only have user/assistant alternating.
    # Let tokenizer.apply_chat_template enforce the exact format.
    prompt = tokenizer.apply_chat_template(
        normalized,
        tokenize=False,
        add_generation_prompt=True,
    )
    return prompt
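# Illustrative example of the normalization above (values made up for illustration,
# not produced by running the code): an input history of
#   [{"role": "system", "content": "Be brief."},
#    {"role": "user", "content": "Hi"},
#    {"role": "user", "content": "What is tort law?"}]
# is collapsed to
#   [{"role": "user", "content": "Be brief.\n\nHi\n\nWhat is tort law?"}]
# before tokenizer.apply_chat_template runs: the system text is folded into the
# first user turn and the two consecutive user turns are merged.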
@app.post("/debug")  # decorator restored; route path assumed, raw-body echo for debugging
async def debug_echo(request: Request):
    body = await request.body()
    print("DEBUG ECHO BODY:", body)
    return {"ok": True}
@app.post("/v1/chat/completions")  # decorator restored; OpenAI-compatible route, path assumed
def chat(request: ChatRequest):
    try:
        messages = [m.dict() for m in request.messages]
        prompt = build_prompt(messages)
    except Exception as e:
        # Don't crash the app; return a 400 with an explanation
        raise HTTPException(status_code=400, detail=f"Invalid message history: {e}")

    outputs = pipe(
        prompt,
        max_new_tokens=request.max_tokens or 512,
        do_sample=(request.temperature or 0.0) > 0,
        temperature=request.temperature or 0.0,
        top_p=1.0,
    )
    # The pipeline returns prompt + completion; strip the prompt to keep only the reply
    full = outputs[0]["generated_text"]
    reply = full[len(prompt):].strip()

    return {
        "id": "chatcmpl-1",
        "object": "chat.completion",
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": reply,
                },
                "finish_reason": "stop",
            }
        ],
    }
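For reference, a minimal client sketch against this service. It assumes the app is reachable at http://localhost:7860 (a common port for a Hugging Face Space) and that the chat route is mounted at /v1/chat/completions as in the decorator above; adjust the URL to match the actual deployment.

import requests

payload = {
    "model": "saul-7b",  # ignored by the server, present only for OpenAI-style compatibility
    "messages": [
        {"role": "system", "content": "You are a concise legal assistant."},
        {"role": "user", "content": "Summarize the doctrine of consideration in one paragraph."},
    ],
    "temperature": 0.0,
    "max_tokens": 256,
}

resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])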