from typing import Dict
from transformers import AutoProcessor, AutoModelForConditionalGeneration
from PIL import Image
import torch
import base64
import io

class EndpointHandler:
    def __init__(self, path=""):
        model_id = path if path else "Qwen/Qwen2.5-VL-7B-Instruct"
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )

    def __call__(self, data: Dict[str, any]) -> Dict[str, str]:
        """
        data = {
          "inputs": {
            "text": "Describe this image",
            "image": "<base64-encoded image>"   # optional
          }
        }
        """
        inputs = {}
        if "text" in data["inputs"]:
            inputs["text"] = data["inputs"]["text"]

        if "image" in data["inputs"]:
            # Bild von Base64 in PIL umwandeln
            image_bytes = base64.b64decode(data["inputs"]["image"])
            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            inputs["images"] = image

        proc_inputs = self.processor(**inputs, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**proc_inputs, max_new_tokens=200)
        result = self.processor.batch_decode(output_ids, skip_special_tokens=True)
        return {"generated_text": result[0]}