# Hugging Face Space: HY-MT1.5-1.8B chatbot (Zero-GPU)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces  # Hugging Face Zero-GPU helper (provides the @spaces.GPU decorator)

# Model configuration: Hugging Face Hub repo id of the chat model served by this app.
MODEL_NAME = "tencent/HY-MT1.5-1.8B"

# Global model and tokenizer instances, lazily populated by load_model() so the
# weights are downloaded and loaded only once per process.
tokenizer = None
model = None
def load_model():
    """Lazily initialise and return the shared (tokenizer, model) pair.

    The pretrained weights are fetched on the first call only; every later
    call returns the cached module-level instances unchanged.
    """
    global tokenizer, model
    # Guard clause: both already loaded — nothing to do.
    if tokenizer is not None and model is not None:
        return tokenizer, model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # device_map="auto" places the weights on the available accelerator;
    # bfloat16 halves memory use relative to float32 on supported GPUs.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
    return tokenizer, model
@spaces.GPU  # Zero-GPU: request a GPU slice for this call (the decorator was imported but never applied)
def generate_response(message, history: list, system_prompt: str = None) -> str:
    """
    Generate a response using the HY-MT1.5-1.8B model with its chat template.

    Args:
        message: The user's input message. A plain string, or — because the
            UI uses a MultimodalTextbox — a dict with a "text" key.
        history: List of previous conversation messages ({"role", "content"} dicts).
        system_prompt: Optional system prompt for the conversation.

    Returns:
        The model's generated response, or an error string on failure.
    """
    try:
        # Load model if not already loaded (cached at module level).
        tokenizer, model = load_model()

        # MultimodalTextbox submits {"text": ..., "files": [...]}; extract the text.
        if isinstance(message, dict):
            message = message.get("text", "")

        # Build the messages list: optional system prompt, prior turns, new turn.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        # return_dict=True is required: without it, apply_chat_template with
        # return_tensors="pt" returns a bare tensor, and accessing `.input_ids`
        # on it raises AttributeError.
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # open the assistant turn
            return_dict=True,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(model.device)

        # Generate the reply (sampling settings unchanged from the original).
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,  # KV cache for faster autoregressive decoding
            )

        # Decode only the newly generated tokens. Slicing past the prompt
        # length is robust, unlike splitting the full decoded text on the
        # literal "assistant", which breaks whenever that word appears in
        # the conversation itself.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    except Exception as e:
        # Surface the failure in the chat window rather than crashing the UI.
        return f"Error generating response: {str(e)}"
def create_conversation_message(role: str, content: str) -> dict:
    """Build a chat-format message mapping from a role and its text content."""
    return dict(role=role, content=content)
# Create the Gradio application UI. The layout is: branded header, the main
# ChatInterface wired to generate_response, two informational accordions,
# and a footer.
with gr.Blocks() as demo:
    # Application header with branding (styled via the .header CSS class).
    gr.Markdown(
        """
        # 🤖 HY-MT1.5-1.8B Chatbot
        A conversational AI powered by Tencent's HY-MT1.5-1.8B model.
        ---
        **Built with** [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """,
        elem_classes=["header"]
    )
    # Main chatbot interface. generate_response receives (message, history)
    # plus the system-prompt textbox declared in additional_inputs.
    chat_interface = gr.ChatInterface(
        fn=generate_response,
        title="",
        description="💬 Start a conversation below! The model responds to your messages using the HY-MT1.5-1.8B chat template.",
        chatbot=gr.Chatbot(
            placeholder="👋 How can I help you today?",
            height=400,
            avatar_images=(
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/user.png",
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/tencent.png"
            ),
            # NOTE(review): `buttons` and `feedback_options` are claimed to be
            # Gradio 6 parameters — confirm against the installed Gradio release.
            buttons=["share", "copy", "copy_all"],  # Gradio 6 uses 'buttons' parameter
            feedback_options=("👍", "👎")  # Gradio 6 uses 'feedback_options' for available options
        ),
        # NOTE(review): MultimodalTextbox submits a dict ({"text", "files"}),
        # not a plain string — verify generate_response handles that shape.
        textbox=gr.MultimodalTextbox(
            placeholder="Type your message here...",
            lines=2,
            max_lines=10,
            submit_btn="Send ✉️",
            stop_btn="Stop ⏹️"
        ),
        # Extra input forwarded to generate_response as its third argument.
        additional_inputs=[
            gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful assistant...",
                lines=2,
                max_lines=4
            )
        ],
        additional_inputs_accordion=gr.Accordion(
            label="⚙️ Advanced Settings",
            open=False
        ),
        # Clickable example prompts shown under the chat area.
        examples=[
            ["Translate 'Hello, how are you?' into French."],
            ["Explain quantum computing in simple terms."],
            ["Write a short poem about the ocean."],
            ["What are the benefits of exercise?"],
            ["Help me plan a trip to Japan."]
        ],
        example_labels=["French Translation", "Quantum Computing", "Ocean Poem", "Exercise Benefits", "Japan Trip"],
        submit_btn="Send ✉️",
        autofocus=True,
        fill_height=True,
        # NOTE(review): `api_visibility` — confirm this keyword exists in the
        # installed Gradio version.
        api_visibility="public"
    )
    # Model information section (collapsed by default).
    with gr.Accordion("📊 Model Information", open=False):
        gr.Markdown(f"""
        ### Model Details
        - **Model**: {MODEL_NAME}
        - **Type**: Causal Language Model with Chat Template
        - **Provider**: [Tencent](https://huggingface.co/tencent)
        - **Hardware**: Zero-GPU (NVIDIA H200)
        ### Capabilities
        - 📝 Text generation and completion
        - 🌐 Translation (supports multiple languages)
        - 💬 Conversational AI
        - 📚 Question answering
        - ✍️ Creative writing
        ### Usage Tips
        - Be clear and specific in your requests
        - For translations, specify the target language
        - Use system prompts to customize behavior
        - Model responds in the language of your query
        """)
    # Zero-GPU info section (expanded by default).
    with gr.Accordion("🚀 Zero-GPU Information", open=True):
        gr.Markdown("""
        ### About Zero-GPU
        This application runs on Hugging Face Spaces with **Zero-GPU** configuration:
        - ✅ **Free GPU Access** - Dynamically allocated NVIDIA H200 GPUs
        - ⚡ **Fast Inference** - GPU-accelerated response generation
        - 💾 **70GB VRAM** - Available GPU memory per workload
        - 🆓 **Free to use** - Generous daily GPU quotas
        ### GPU Quotas (Daily)
        | Account Type | GPU Time | Priority |
        |--------------|----------|----------|
        | Unauthenticated | 2 min | Low |
        | Free | 3.5 min | Medium |
        | PRO | 25 min | Highest |
        | Enterprise | 45 min | Highest |
        ### Technical Specs
        - **GPU**: NVIDIA H200 slice (70GB VRAM)
        - **Precision**: bfloat16
        - **Duration Limit**: 120 seconds per request
        ### Tips for Best Experience
        - PRO users get x7 more daily usage
        - Shorter requests get higher queue priority
        - Model cached between requests for faster responses
        """)
    # Footer (styled via the .footer CSS class).
    gr.Markdown(
        """
        ---
        *This application uses the HY-MT1.5-1.8B model from Hugging Face.
        Powered by Zero-GPU. Responses are generated with GPU acceleration.*
        """,
        elem_classes=["footer"]
    )
# Launch the application with Gradio 6 configuration.
# NOTE(review): in classic Gradio (<= 5), `theme` and `css` are arguments of
# the gr.Blocks() constructor, not of launch(); likewise `footer_links` is not
# a known launch() keyword there, and `height`/`width` only affect inline
# (notebook-embedded) display. Confirm all of these against the installed
# Gradio 6 release before relying on them.
demo.launch(
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="md",
        radius_size="md"
    ).set(
        # Theme overrides: darker primary buttons, heavier block titles.
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600"
    ),
    # Custom CSS for the header, footer, and overall container width.
    css="""
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        border-radius: 12px;
        margin-bottom: 20px;
    }
    .header h1 {
        color: white !important;
        margin-bottom: 10px;
    }
    .header a {
        color: #ffd700;
        font-weight: bold;
        text-decoration: none;
    }
    .header a:hover {
        text-decoration: underline;
    }
    .footer {
        text-align: center;
        color: #666;
        font-size: 0.9em;
        padding: 10px;
    }
    .gradio-container {
        max-width: 1200px !important;
        margin: 0 auto;
    }
    """,
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "HY-MT1.5-1.8B", "url": "https://huggingface.co/tencent/HY-MT1.5-1.8B"},
        {"label": "Tencent", "url": "https://huggingface.co/tencent"},
        {"label": "Zero-GPU Docs", "url": "https://huggingface.co/docs/spaces/runtimes/zero"}
    ],
    height=750,
    width="100%"
)