# akhaliq's picture
# akhaliq HF Staff
# Update app.py from anycoder
# 8c26f2e verified
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces
# Model configuration
MODEL_NAME = "tencent/HY-MT1.5-1.8B"  # Hugging Face Hub repo id loaded below

# Global model and tokenizer instances.
# Both start as None and are lazily populated (once per process) by
# load_model(); generate_response() reuses them across requests.
tokenizer = None  # transformers.AutoTokenizer once loaded
model = None  # transformers.AutoModelForCausalLM once loaded
@spaces.GPU
def load_model():
    """Lazily initialize the global tokenizer and model (Zero-GPU aware).

    The expensive ``from_pretrained`` work runs only on the first call;
    every later call returns the already-loaded instances.

    Returns:
        tuple: ``(tokenizer, model)`` ready for inference.
    """
    global tokenizer, model
    if model is None or tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # bfloat16 weights with device_map="auto" so the model lands on the
        # dynamically allocated Zero-GPU device.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    return tokenizer, model
@spaces.GPU(duration=120)
def generate_response(message: str, history: list, system_prompt: str = None) -> str:
    """
    Generate a response using the HY-MT1.5-1.8B model with its chat template.
    Zero-GPU optimized with GPU acceleration.

    Args:
        message: The user's input message. gr.MultimodalTextbox delivers a
            {"text": ..., "files": [...]} dict, which is also accepted.
        history: Previous conversation turns as {"role": ..., "content": ...}
            dicts (Gradio "messages" format).
        system_prompt: Optional system prompt for the conversation.

    Returns:
        The model's generated response, or an error string on failure.
    """
    try:
        # Load model if not already loaded (cached across requests).
        tokenizer, model = load_model()

        # MultimodalTextbox sends a dict; this model is text-only, so keep
        # just the text portion.
        if isinstance(message, dict):
            message = message.get("text", "")

        # Build the chat-template message list.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        for msg in history:
            # Keep only plain-text history entries (file uploads from the
            # multimodal textbox have non-string content).
            if isinstance(msg, dict) and isinstance(msg.get("content"), str):
                messages.append({"role": msg["role"], "content": msg["content"]})
        messages.append({"role": "user", "content": message})

        # return_dict=True is required to get input_ids/attention_mask back;
        # without it apply_chat_template returns a bare tensor and the old
        # `.input_ids` attribute access raised AttributeError.
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # open the assistant turn
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate response (GPU-optimized settings).
        with torch.no_grad():
            outputs = model.generate(
                **inputs,  # also forwards attention_mask, avoiding pad warnings
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True  # KV cache for faster autoregressive decoding
            )

        # Decode only the newly generated tokens: slicing off the prompt is
        # robust, unlike splitting the full decoded text on "assistant".
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return response.strip()
    except Exception as e:
        # Surface the error in the chat UI instead of crashing the request.
        return f"Error generating response: {str(e)}"
def create_conversation_message(role: str, content: str) -> dict:
    """Return a chat-format message dict for the given role and content."""
    return dict(role=role, content=content)
# Create the Gradio 6 application (UI layout; launched at the bottom of the file)
# NOTE(review): emoji in the UI strings below appear mojibake-encoded
# (UTF-8 bytes decoded as cp1252) — confirm the file encoding upstream;
# they are preserved byte-for-byte here.
with gr.Blocks() as demo:
    # Application header with branding
    gr.Markdown(
        """
# πŸ€– HY-MT1.5-1.8B Chatbot
A conversational AI powered by Tencent's HY-MT1.5-1.8B model.
---
**Built with** [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
""",
        elem_classes=["header"]  # styled by the .header rules in the launch() css
    )
    # Main chatbot interface: ChatInterface calls
    # generate_response(message, history, system_prompt) once per user turn.
    chat_interface = gr.ChatInterface(
        fn=generate_response,
        title="",
        description="πŸ’¬ Start a conversation below! The model responds to your messages using the HY-MT1.5-1.8B chat template.",
        chatbot=gr.Chatbot(
            placeholder="πŸ’­ How can I help you today?",
            height=400,
            avatar_images=(
                # (user avatar, bot avatar)
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/user.png",
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/tencent.png"
            ),
            # NOTE(review): 'buttons' and 'feedback_options' are claimed to be
            # Gradio 6 parameters — verify against the installed Gradio version.
            buttons=["share", "copy", "copy_all"],  # Gradio 6 uses 'buttons' parameter
            feedback_options=("πŸ‘", "πŸ‘Ž")  # Gradio 6 uses 'feedback_options' for available options
        ),
        # NOTE(review): MultimodalTextbox delivers {"text": ..., "files": [...]}
        # as the message argument, while generate_response is annotated
        # `message: str` — confirm the handler accepts dict input.
        textbox=gr.MultimodalTextbox(
            placeholder="Type your message here...",
            lines=2,
            max_lines=10,
            submit_btn="Send ✈️",
            stop_btn="Stop ⏹️"
        ),
        # Extra inputs are appended to the fn call after (message, history);
        # this single textbox feeds the `system_prompt` parameter.
        additional_inputs=[
            gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful assistant...",
                lines=2,
                max_lines=4
            )
        ],
        additional_inputs_accordion=gr.Accordion(
            label="βš™οΈ Advanced Settings",
            open=False
        ),
        # One-click example prompts shown beneath the chat box.
        examples=[
            ["Translate 'Hello, how are you?' into French."],
            ["Explain quantum computing in simple terms."],
            ["Write a short poem about the ocean."],
            ["What are the benefits of exercise?"],
            ["Help me plan a trip to Japan."]
        ],
        example_labels=["French Translation", "Quantum Computing", "Ocean Poem", "Exercise Benefits", "Japan Trip"],
        # NOTE(review): submit_btn is set both here and on the textbox above;
        # `api_visibility` looks Gradio-6-specific — confirm it exists.
        submit_btn="Send ✈️",
        autofocus=True,
        fill_height=True,
        api_visibility="public"
    )
    # Model information section (static help text, collapsed by default)
    with gr.Accordion("πŸ“‹ Model Information", open=False):
        gr.Markdown(f"""
### Model Details
- **Model**: {MODEL_NAME}
- **Type**: Causal Language Model with Chat Template
- **Provider**: [Tencent](https://huggingface.co/tencent)
- **Hardware**: Zero-GPU (NVIDIA H200)
### Capabilities
- πŸ“ Text generation and completion
- 🌍 Translation (supports multiple languages)
- πŸ’¬ Conversational AI
- πŸ“– Question answering
- ✍️ Creative writing
### Usage Tips
- Be clear and specific in your requests
- For translations, specify the target language
- Use system prompts to customize behavior
- Model responds in the language of your query
""")
    # Zero-GPU info section (static help text, expanded by default)
    with gr.Accordion("πŸš€ Zero-GPU Information", open=True):
        gr.Markdown("""
### About Zero-GPU
This application runs on Hugging Face Spaces with **Zero-GPU** configuration:
- βœ… **Free GPU Access** - Dynamically allocated NVIDIA H200 GPUs
- ⚑ **Fast Inference** - GPU-accelerated response generation
- πŸ’Ύ **70GB VRAM** - Available GPU memory per workload
- πŸ†“ **Free to use** - Generous daily GPU quotas
### GPU Quotas (Daily)
| Account Type | GPU Time | Priority |
|--------------|----------|----------|
| Unauthenticated | 2 min | Low |
| Free | 3.5 min | Medium |
| PRO | 25 min | Highest |
| Enterprise | 45 min | Highest |
### Technical Specs
- **GPU**: NVIDIA H200 slice (70GB VRAM)
- **Precision**: bfloat16
- **Duration Limit**: 120 seconds per request
### Tips for Best Experience
- PRO users get x7 more daily usage
- Shorter requests get higher queue priority
- Model cached between requests for faster responses
""")
    # Footer
    gr.Markdown(
        """
---
*This application uses the HY-MT1.5-1.8B model from Hugging Face.
Powered by Zero-GPU. Responses are generated with GPU acceleration.*
""",
        elem_classes=["footer"]  # styled by the .footer rules in the launch() css
    )
# Launch the application with Gradio 6 configuration
# NOTE(review): in Gradio 4/5, `theme` and `css` are gr.Blocks(...) constructor
# arguments and `footer_links` is not a launch() parameter; `height`/`width`
# only affect inline (notebook) rendering. Confirm the installed Gradio 6
# launch() signature actually accepts all of these — otherwise move
# theme/css into gr.Blocks(...) above.
demo.launch(
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="md",
        radius_size="md"
    ).set(
        # .set() overrides individual theme CSS variables.
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600"
    ),
    # Custom CSS targeting the elem_classes used by the header/footer Markdown.
    css="""
.header {
text-align: center;
padding: 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
border-radius: 12px;
margin-bottom: 20px;
}
.header h1 {
color: white !important;
margin-bottom: 10px;
}
.header a {
color: #ffd700;
font-weight: bold;
text-decoration: none;
}
.header a:hover {
text-decoration: underline;
}
.footer {
text-align: center;
color: #666;
font-size: 0.9em;
padding: 10px;
}
.gradio-container {
max-width: 1200px !important;
margin: 0 auto;
}
""",
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "HY-MT1.5-1.8B", "url": "https://huggingface.co/tencent/HY-MT1.5-1.8B"},
        {"label": "Tencent", "url": "https://huggingface.co/tencent"},
        {"label": "Zero-GPU Docs", "url": "https://huggingface.co/docs/spaces/runtimes/zero"}
    ],
    height=750,
    width="100%"
)