create-caption

Paused

App Files Community

nroggendorff committed on Nov 15

Commit

87153fd

verified ·

1 Parent(s): 97901ed

Update train.py

Browse files

Files changed (1) hide show

train.py +48 -22

train.py CHANGED Viewed

@@ -26,35 +26,61 @@ def load_model(model_name="datalab-to/chandra", device_id=0):
 def caption_batch(batch, processor, model):
     images = batch["image"]
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {
-                    "type": "text",
-                    "text": "Describe the image, and skip mentioning that it's illustrated or from anime.",
-                },
-            ],
-        }
-        for image in images
-    ]
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
     ).to(model.device)
     with torch.no_grad():
-        generated = model.generate(**inputs)
     decoded = processor.batch_decode(generated)
-    captions = [d.split("<|im_start|>assistant\n")[-1] for d in decoded]
-    return {"image": images, "text": captions}
 # %%
 import datasets

 def caption_batch(batch, processor, model):
     images = batch["image"]
+    encoded_list = []
+    for image in images:
+        msg = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {
+                        "type": "text",
+                        "text": "Describe the image, and skip mentioning that it's illustrated or from anime.",
+                    },
+                ],
+            }
+        ]
+        enc = processor.apply_chat_template(
+            msg,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        encoded_list.append(enc)
+    input_ids = torch.nn.utils.rnn.pad_sequence(
+        [e.input_ids[0] for e in encoded_list],
+        batch_first=True,
+        padding_value=processor.tokenizer.pad_token_id,
+    ).to(model.device)
+    attention_mask = torch.nn.utils.rnn.pad_sequence(
+        [e.attention_mask[0] for e in encoded_list],
+        batch_first=True,
+        padding_value=0,
     ).to(model.device)
     with torch.no_grad():
+        generated = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
     decoded = processor.batch_decode(generated)
+    captions = []
+    for d in decoded:
+        if "<|im_start|>assistant" in d:
+            d = d.split("<|im_start|>assistant")[-1].strip()
+        captions.append(d)
+    return {
+        "image": images,
+        "text": captions,
+    }
 # %%
 import datasets