"""Gradio chat application for tencent/HY-MT1.5-1.8B on Zero-GPU Spaces."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model configuration
MODEL_NAME = "tencent/HY-MT1.5-1.8B"

# Global model and tokenizer instances (cached between GPU requests)
tokenizer = None
model = None


@spaces.GPU
def load_model():
    """Load the model and tokenizer with Zero-GPU support.

    Lazily initializes the module-level ``tokenizer``/``model`` globals on
    first call and returns the cached pair on subsequent calls.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    global tokenizer, model
    if tokenizer is None or model is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Load on GPU for Zero-GPU spaces; bfloat16 for GPU efficiency.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
    return tokenizer, model


@spaces.GPU(duration=120)
def generate_response(message: str, history: list, system_prompt: str = None) -> str:
    """
    Generate a response using the HY-MT1.5-1.8B model with chat template.
    Zero-GPU optimized with GPU acceleration.

    Args:
        message: The user's input message. May be a plain string or, when
            submitted through the MultimodalTextbox UI, a dict of the form
            ``{"text": ..., "files": [...]}``.
        history: List of previous conversation messages (assumed to be in
            "messages" format — dicts with "role"/"content" keys).
        system_prompt: Optional system prompt for the conversation.

    Returns:
        The model's generated response, or an ``"Error generating
        response: ..."`` string if anything fails.
    """
    try:
        # Load model if not already loaded
        tokenizer, model = load_model()

        # The MultimodalTextbox in the UI submits a dict, not a str —
        # extract the text part so the chat template receives a string.
        if isinstance(message, dict):
            message = message.get("text", "")

        # Build messages list: optional system prompt, then history,
        # then the current user turn.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        # BUG FIX: with tokenize=True / return_tensors="pt" and no
        # return_dict=True, apply_chat_template returns a plain tensor, so
        # the original `tokenized_chat.input_ids` raised AttributeError on
        # every request. Use the returned tensor directly.
        input_ids = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # open the assistant turn
            return_tensors="pt",
        ).to(model.device)

        # Generate response (GPU-optimized settings)
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=1024,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,  # Enable KV cache for GPU efficiency
            )

        # BUG FIX: decode only the newly generated tokens instead of
        # string-splitting the full transcript. The original fallback
        # `response.split("")` always raised ValueError("empty separator")
        # whenever "assistant" was absent from the decoded text.
        generated_tokens = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    except Exception as e:
        # Surface the error in the chat UI rather than crashing the request.
        return f"Error generating response: {str(e)}"


def create_conversation_message(role: str, content: str) -> dict:
    """Create a message dictionary for the conversation."""
    return {"role": role, "content": content}


# BUG FIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# launch() arguments — passing them to launch() raises TypeError. They are
# defined here and handed to gr.Blocks() below.
THEME = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="md",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

CSS = """
.header {
    text-align: center;
    padding: 20px;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 12px;
    margin-bottom: 20px;
}
.header h1 {
    color: white !important;
    margin-bottom: 10px;
}
.header a {
    color: #ffd700;
    font-weight: bold;
    text-decoration: none;
}
.header a:hover {
    text-decoration: underline;
}
.footer {
    text-align: center;
    color: #666;
    font-size: 0.9em;
    padding: 10px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
"""

# Create the Gradio 6 application
with gr.Blocks(theme=THEME, css=CSS) as demo:
    # Application header with branding
    gr.Markdown(
        """
        # 🤖 HY-MT1.5-1.8B Chatbot

        A conversational AI powered by Tencent's HY-MT1.5-1.8B model.

        ---
        **Built with** [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """,
        elem_classes=["header"],
    )

    # Main chatbot interface
    chat_interface = gr.ChatInterface(
        fn=generate_response,
        title="",
        description="đŸ’Ŧ Start a conversation below! The model responds to your messages using the HY-MT1.5-1.8B chat template.",
        chatbot=gr.Chatbot(
            placeholder="💭 How can I help you today?",
            height=400,
            avatar_images=(
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/user.png",
                "https://huggingface.co/datasets/huggingface/avatars/resolve/main/tencent.png",
            ),
            buttons=["share", "copy", "copy_all"],  # Gradio 6 uses 'buttons' parameter
            feedback_options=("👍", "👎"),  # Gradio 6 uses 'feedback_options' for available options
        ),
        textbox=gr.MultimodalTextbox(
            placeholder="Type your message here...",
            lines=2,
            max_lines=10,
            submit_btn="Send âœˆī¸",
            stop_btn="Stop âšī¸",
        ),
        additional_inputs=[
            gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="You are a helpful assistant...",
                lines=2,
                max_lines=4,
            )
        ],
        additional_inputs_accordion=gr.Accordion(
            label="âš™ī¸ Advanced Settings",
            open=False,
        ),
        examples=[
            ["Translate 'Hello, how are you?' into French."],
            ["Explain quantum computing in simple terms."],
            ["Write a short poem about the ocean."],
            ["What are the benefits of exercise?"],
            ["Help me plan a trip to Japan."],
        ],
        example_labels=["French Translation", "Quantum Computing", "Ocean Poem", "Exercise Benefits", "Japan Trip"],
        submit_btn="Send âœˆī¸",
        autofocus=True,
        fill_height=True,
        api_visibility="public",
    )

    # Model information section
    with gr.Accordion("📋 Model Information", open=False):
        gr.Markdown(f"""
        ### Model Details
        - **Model**: {MODEL_NAME}
        - **Type**: Causal Language Model with Chat Template
        - **Provider**: [Tencent](https://huggingface.co/tencent)
        - **Hardware**: Zero-GPU (NVIDIA H200)

        ### Capabilities
        - 📝 Text generation and completion
        - 🌍 Translation (supports multiple languages)
        - đŸ’Ŧ Conversational AI
        - 📖 Question answering
        - âœī¸ Creative writing

        ### Usage Tips
        - Be clear and specific in your requests
        - For translations, specify the target language
        - Use system prompts to customize behavior
        - Model responds in the language of your query
        """)

    # Zero-GPU info section
    with gr.Accordion("🚀 Zero-GPU Information", open=True):
        gr.Markdown("""
        ### About Zero-GPU
        This application runs on Hugging Face Spaces with **Zero-GPU** configuration:

        - ✅ **Free GPU Access** - Dynamically allocated NVIDIA H200 GPUs
        - ⚡ **Fast Inference** - GPU-accelerated response generation
        - 💾 **70GB VRAM** - Available GPU memory per workload
        - 🆓 **Free to use** - Generous daily GPU quotas

        ### GPU Quotas (Daily)
        | Account Type | GPU Time | Priority |
        |--------------|----------|----------|
        | Unauthenticated | 2 min | Low |
        | Free | 3.5 min | Medium |
        | PRO | 25 min | Highest |
        | Enterprise | 45 min | Highest |

        ### Technical Specs
        - **GPU**: NVIDIA H200 slice (70GB VRAM)
        - **Precision**: bfloat16
        - **Duration Limit**: 120 seconds per request

        ### Tips for Best Experience
        - PRO users get x7 more daily usage
        - Shorter requests get higher queue priority
        - Model cached between requests for faster responses
        """)

    # Footer
    gr.Markdown(
        """
        ---
        *This application uses the HY-MT1.5-1.8B model from Hugging Face.
        Powered by Zero-GPU. Responses are generated with GPU acceleration.*
        """,
        elem_classes=["footer"],
    )

# Launch the application with Gradio 6 configuration
demo.launch(
    # NOTE(review): `footer_links`, `height`, and `width` are kept from the
    # original — confirm this Gradio version's launch() accepts them.
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "HY-MT1.5-1.8B", "url": "https://huggingface.co/tencent/HY-MT1.5-1.8B"},
        {"label": "Tencent", "url": "https://huggingface.co/tencent"},
        {"label": "Zero-GPU Docs", "url": "https://huggingface.co/docs/spaces/runtimes/zero"},
    ],
    height=750,
    width="100%",
)