"""Gradio chat application for tencent/HY-MT1.5-1.8B on Zero-GPU Spaces."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model configuration
MODEL_NAME = "tencent/HY-MT1.5-1.8B"

# Global model and tokenizer instances (cached between GPU requests)
tokenizer = None
model = None


@spaces.GPU
def load_model():
    """Load the model and tokenizer with Zero-GPU support.

    Lazily initializes the module-level ``tokenizer``/``model`` globals on
    first call and returns the cached pair on subsequent calls.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Load on GPU for Zero-GPU spaces; bfloat16 for GPU efficiency.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    return tokenizer, model


@spaces.GPU(duration=120)
def generate_response(message: str, history: list, system_prompt: str = None) -> str:
    """
    Generate a response using the HY-MT1.5-1.8B model with chat template.
    Zero-GPU optimized with GPU acceleration.

    Args:
        message: The user's input message. May be a plain string or, when
            submitted through the MultimodalTextbox UI, a dict of the form
            ``{"text": ..., "files": [...]}``.
        history: List of previous conversation messages (assumed to be in
            "messages" format — dicts with "role"/"content" keys).
        system_prompt: Optional system prompt for the conversation.

    Returns:
        The model's generated response, or an ``"Error generating
        response: ..."`` string if anything fails.
    """
    try:
        # Load model if not already loaded
        tokenizer, model = load_model()

        # The MultimodalTextbox in the UI submits a dict, not a str —
        # extract the text part so the chat template receives a string.
        if isinstance(message, dict):
            message = message.get("text", "")

        # Build messages list: optional system prompt, then history,
        # then the current user turn.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        # BUG FIX: with tokenize=True / return_tensors="pt" and no
        # return_dict=True, apply_chat_template returns a plain tensor, so
        # the original `tokenized_chat.input_ids` raised AttributeError on
        # every request. Use the returned tensor directly.
        input_ids = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # open the assistant turn
            return_tensors="pt",
        ).to(model.device)

        # Generate response (GPU-optimized settings)
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,  # Enable KV cache for GPU efficiency
            )

        # BUG FIX: decode only the newly generated tokens instead of
        # string-splitting the full transcript. The original fallback
        # `response.split("")` always raised ValueError("empty separator")
        # whenever "assistant" was absent from the decoded text.
        generated_tokens = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    except Exception as e:
        # Surface the error in the chat UI rather than crashing the request.
        return f"Error generating response: {str(e)}"


def create_conversation_message(role: str, content: str) -> dict:
    """Create a message dictionary for the conversation."""
    return {"role": role, "content": content}


# BUG FIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# launch() arguments — passing them to launch() raises TypeError. They are
# defined here and handed to gr.Blocks() below.
THEME = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="md",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

CSS = """
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    margin-bottom: 20px;
}
.header h1 {
    color: white !important;
    margin-bottom: 10px;
}
.header a {
    color: #ffd700;
    font-weight: bold;
    text-decoration: none;
}
.header a:hover {
    text-decoration: underline;
}
.footer {
    text-align: center;
    color: #666;
    font-size: 0.9em;
    padding: 10px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
"""

# Create the Gradio 6 application
with gr.Blocks(theme=THEME, css=CSS) as demo:
    # Application header with branding
    gr.Markdown(
        """
        # 🤖 HY-MT1.5-1.8B Chatbot

        A conversational AI powered by Tencent's HY-MT1.5-1.8B model.

        ---
        **Built with** [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """,
        elem_classes=["header"],
    )

    # Main chatbot interface
    chat_interface = gr.ChatInterface(
        fn=generate_response,
        title="",
        description="đŸ’Ŧ Start a conversation below! The model responds to your messages using the HY-MT1.5-1.8B chat template.",
        chatbot=gr.Chatbot(
            placeholder="💭 How can I help you today?",
            height=400,
            avatar_images=(
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/user.png",
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/tencent.png",
            ),
            buttons=["share", "copy", "copy_all"],  # Gradio 6 uses 'buttons' parameter
            feedback_options=("👍", "👎"),  # Gradio 6 uses 'feedback_options' for available options
        ),
        textbox=gr.MultimodalTextbox(
            placeholder="Type your message here...",
            lines=2,
            max_lines=10,
            submit_btn="Send âœˆī¸",
            stop_btn="Stop âšī¸",
        ),
        additional_inputs=[
            gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful assistant...",
                lines=2,
                max_lines=4,
            )
        ],
        additional_inputs_accordion=gr.Accordion(
            label="âš™ī¸ Advanced Settings",
            open=False,
        ),
        examples=[
            ["Translate 'Hello, how are you?' into French."],
            ["Explain quantum computing in simple terms."],
            ["Write a short poem about the ocean."],
            ["What are the benefits of exercise?"],
            ["Help me plan a trip to Japan."],
        ],
        example_labels=["French Translation", "Quantum Computing", "Ocean Poem", "Exercise Benefits", "Japan Trip"],
        submit_btn="Send âœˆī¸",
        autofocus=True,
        fill_height=True,
        api_visibility="public",
    )

    # Model information section
    with gr.Accordion("📋 Model Information", open=False):
        gr.Markdown(f"""
        ### Model Details
        - **Model**: {MODEL_NAME}
        - **Type**: Causal Language Model with Chat Template
        - **Provider**: [Tencent](https://huggingface.co/tencent)
        - **Hardware**: Zero-GPU (NVIDIA H200)

        ### Capabilities
        - 📝 Text generation and completion
        - 🌍 Translation (supports multiple languages)
        - đŸ’Ŧ Conversational AI
        - 📖 Question answering
        - âœī¸ Creative writing

        ### Usage Tips
        - Be clear and specific in your requests
        - For translations, specify the target language
        - Use system prompts to customize behavior
        - Model responds in the language of your query
        """)

    # Zero-GPU info section
    with gr.Accordion("🚀 Zero-GPU Information", open=True):
        gr.Markdown("""
        ### About Zero-GPU
        This application runs on Hugging Face Spaces with **Zero-GPU** configuration:

        - ✅ **Free GPU Access** - Dynamically allocated NVIDIA H200 GPUs
        - ⚡ **Fast Inference** - GPU-accelerated response generation
        - 💾 **70GB VRAM** - Available GPU memory per workload
        - 🆓 **Free to use** - Generous daily GPU quotas

        ### GPU Quotas (Daily)
        | Account Type | GPU Time | Priority |
        |--------------|----------|----------|
        | Unauthenticated | 2 min | Low |
        | Free | 3.5 min | Medium |
        | PRO | 25 min | Highest |
        | Enterprise | 45 min | Highest |

        ### Technical Specs
        - **GPU**: NVIDIA H200 slice (70GB VRAM)
        - **Precision**: bfloat16
        - **Duration Limit**: 120 seconds per request

        ### Tips for Best Experience
        - PRO users get x7 more daily usage
        - Shorter requests get higher queue priority
        - Model cached between requests for faster responses
        """)

    # Footer
    gr.Markdown(
        """
        ---
        *This application uses the HY-MT1.5-1.8B model from Hugging Face.
        Powered by Zero-GPU. Responses are generated with GPU acceleration.*
        """,
        elem_classes=["footer"],
    )

# Launch the application with Gradio 6 configuration
demo.launch(
    # NOTE(review): `footer_links`, `height`, and `width` are kept from the
    # original — confirm this Gradio version's launch() accepts them.
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "HY-MT1.5-1.8B", "url": "https://huggingface.co/tencent/HY-MT1.5-1.8B"},
        {"label": "Tencent", "url": "https://huggingface.co/tencent"},
        {"label": "Zero-GPU Docs", "url": "https://huggingface.co/docs/spaces/runtimes/zero"},
    ],
    height=750,
    width="100%",
)