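"""Streaming chat demo for inclusionAI/Ring-mini-2.0 on Hugging Face Spaces (ZeroGPU)."""
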
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import spaces
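# `spaces` provides the @spaces.GPU decorator used on ZeroGPU Spaces to
# attach a GPU only for the duration of the decorated call.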

# Model and tokenizer initialization
MODEL_NAME = "inclusionAI/Ring-mini-2.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

@spaces.GPU(duration=120)
def generate_response(message, history):
    """Stream a chat response to `message`, yielding the accumulated partial text."""
    
    # Build the chat as a list of messages: system prompt first, then the
    # conversation history, then the current user message.
    messages = [
        # System prompt (Chinese): "You are Ring, an intelligent assistant developed
        # by Ant Group, dedicated to providing users with helpful information and
        # assistance; answer users' questions in Chinese."
        {"role": "system", "content": "你是 Ring,蚂蚁集团开发的智能助手,致力于为用户提供有用的信息和帮助,用中文回答用户的问题。"}
    ]
    
    # Add conversation history
    # history is a list of (human, assistant) tuples
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    
    # Add current message from user
    messages.append({"role": "user", "content": message})
    
    # Apply chat template
    # Doc: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L1510
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Tokenize input
    model_inputs = tokenizer([text], return_tensors="pt", return_token_type_ids=False).to(model.device)
    
    # Stream decoded tokens as they are produced; skip_prompt drops the echoed
    # input and skip_special_tokens strips chat-template control tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    
    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=8192,
        temperature=0.7,
        do_sample=True,
        streamer=streamer,
    )
    
    # Start generation in a separate thread to enable streaming
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    # Accumulate and yield the growing response as text arrives from the streamer
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    
    # Wait for the generation thread to finish
    thread.join()
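

# Minimal sketch of the Gradio wiring this callback is typically used with on
# a Space. The interface options below (title, launch settings) are assumptions;
# type="tuples" matches the (human, assistant) pairs unpacked in
# generate_response above.
import gradio as gr

demo = gr.ChatInterface(
    fn=generate_response,   # generator: yields the growing response string
    type="tuples",          # history passed as (user, assistant) tuples
    title="Ring-mini-2.0",  # assumed display title
)

if __name__ == "__main__":
    demo.launch()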