from typing import Dict from transformers import AutoProcessor, AutoModelForConditionalGeneration from PIL import Image import torch import base64 import io class EndpointHandler: def __init__(self, path=""): model_id = path if path else "Qwen/Qwen2.5-VL-7B-Instruct" self.processor = AutoProcessor.from_pretrained(model_id) self.model = AutoModelForConditionalGeneration.from_pretrained( model_id, torch_dtype=torch.bfloat16, device_map="auto" ) def __call__(self, data: Dict[str, any]) -> Dict[str, str]: """ data = { "inputs": { "text": "Describe this image", "image": "" # optional } } """ inputs = {} if "text" in data["inputs"]: inputs["text"] = data["inputs"]["text"] if "image" in data["inputs"]: # Bild von Base64 in PIL umwandeln image_bytes = base64.b64decode(data["inputs"]["image"]) image = Image.open(io.BytesIO(image_bytes)).convert("RGB") inputs["images"] = image proc_inputs = self.processor(**inputs, return_tensors="pt").to(self.model.device) output_ids = self.model.generate(**proc_inputs, max_new_tokens=200) result = self.processor.batch_decode(output_ids, skip_special_tokens=True) return {"generated_text": result[0]}