nroggendorff committed
Commit 7927c3b · verified · 1 Parent(s): 8412424

Update train.py

Files changed (1):
  1. train.py +39 -44
train.py CHANGED
@@ -14,7 +14,7 @@ def load_model(model_name, device_id=0):
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
+        bnb_4bit_use_double_quant=False,
     )
 
     processor = AutoProcessor.from_pretrained(model_name)
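
Note: this hunk turns nested (double) quantization off. Double quantization also quantizes the quantization constants themselves, saving roughly 0.4 bits per parameter at the cost of an extra dequantization step; disabling it trades that memory back for a little speed. For reference, a self-contained sketch of the same 4-bit setup:

    import torch
    from transformers import BitsAndBytesConfig

    # NF4 4-bit weights with bf16 compute, double quantization disabled,
    # mirroring the config after this commit.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=False,
    )
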
@@ -24,18 +24,16 @@ def load_model(model_name, device_id=0):
         quantization_config=bnb_config,
         dtype=torch.bfloat16,
         device_map={"": device_id},
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
     )
 
     return processor, model
 
 
-processed_count = 0
-
 def caption_batch(batch, processor, model):
-    global processed_count
-
     images = batch["image"]
-
+
    pil_images = []
     for image in images:
         if not isinstance(image, Image.Image):
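
Note: the model is now loaded with FlashAttention 2, and in recent transformers releases torch_dtype and dtype are aliases of the same from_pretrained argument. A minimal sketch of the loading path, assuming an AutoModelForVision2Seq-style class (the actual model class isn't visible in this diff; quantization config omitted for brevity):

    import torch
    from transformers import AutoModelForVision2Seq, AutoProcessor

    def load_model_sketch(model_name, device_id=0):
        processor = AutoProcessor.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,               # flash-attn needs fp16/bf16
            attn_implementation="flash_attention_2",  # requires the flash-attn package
            device_map={"": device_id},               # pin the whole model to one GPU
        )
        return processor, model
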
@@ -44,56 +42,51 @@ def caption_batch(batch, processor, model):
         image = image.convert("RGB")
         pil_images.append(image)
 
-    messages_list = []
-    for pil_image in pil_images:
-        msg = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": "Describe the image, and skip mentioning that it's illustrated or from anime."},
-                ],
-            }
-        ]
-        messages_list.append(msg)
-
-    texts = processor.apply_chat_template(messages_list, add_generation_prompt=True, tokenize=False)
-
-    inputs = processor(
-        text=texts,
-        images=pil_images,
-        return_tensors="pt",
-        padding=True
+    msg = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {
+                    "type": "text",
+                    "text": "Describe the image concisely, and skip mentioning that it's illustrated or from anime.",
+                },
+            ],
+        }
+    ]
+
+    text = processor.apply_chat_template(
+        msg, add_generation_prompt=True, tokenize=False
     )
+    texts = [text] * len(pil_images)
+
+    inputs = processor(text=texts, images=pil_images, return_tensors="pt", padding=True)
 
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    with torch.no_grad():
+    with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
         generated = model.generate(
             **inputs,
-            max_new_tokens=256,
+            max_new_tokens=128,
+            do_sample=False,
+            use_cache=True,
         )
 
     decoded = processor.batch_decode(generated, skip_special_tokens=False)
 
     captions = []
+    special_tokens = set(processor.tokenizer.all_special_tokens)
     for d in decoded:
         if "<|im_start|>assistant" in d:
-            d = d.split("<|im_start|>assistant")[-1].strip()
-
-        special_tokens = set(processor.tokenizer.all_special_tokens)
+            d = d.split("<|im_start|>assistant")[-1]
+
         for token in special_tokens:
             d = d.replace(token, "")
-
+
         d = d.strip()
         captions.append(d)
 
-    processed_count += len(images)
-    if processed_count > 100:
-        print(f"Processed {processed_count} examples so far...")
-
     return {
-        "image": images,
         "text": captions,
     }
 
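Note: instead of templating one conversation per image, the rewrite renders the chat template once and broadcasts the resulting prompt string across the batch, then generates greedily under no_grad. A condensed sketch of that pattern for any chat-style vision-language processor (names are generic):

    import torch

    def caption_sketch(processor, model, pil_images, prompt):
        # Template a single-image user turn once...
        msg = [{"role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
        text = processor.apply_chat_template(msg, add_generation_prompt=True, tokenize=False)
        # ...and reuse the same prompt string for every image in the batch.
        inputs = processor(text=[text] * len(pil_images), images=pil_images,
                           return_tensors="pt", padding=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
        return processor.batch_decode(out, skip_special_tokens=True)

One caveat: torch.cuda.amp.autocast is deprecated in current PyTorch in favor of torch.amp.autocast("cuda", dtype=...).
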
@@ -101,9 +94,6 @@ def caption_batch(batch, processor, model):
 def process_shard_worker(
     gpu_id, start, end, model_name, batch_size, input_dataset, output_file
 ):
-    global processed_count
-    processed_count = 0
-
     torch.cuda.set_device(gpu_id)
 
     print(f"[GPU {gpu_id}] Loading model...", flush=True)
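
Note: process_shard_worker pins each process to one GPU and handles a half-open [start, end) slice of the dataset; dropping the global counter removes per-process mutable state. The driver that spawns these workers isn't part of this diff; a minimal sketch of that pattern (argument order matches the worker signature above, output file naming is a hypothetical):

    import multiprocessing as mp

    def launch_workers_sketch(num_gpus, dataset_len, model_name, batch_size, input_dataset):
        ctx = mp.get_context("spawn")  # CUDA generally requires the spawn start method
        per_gpu = (dataset_len + num_gpus - 1) // num_gpus
        procs = []
        for gpu_id in range(num_gpus):
            start = gpu_id * per_gpu
            end = min(start + per_gpu, dataset_len)
            p = ctx.Process(
                target=process_shard_worker,
                args=(gpu_id, start, end, model_name, batch_size,
                      input_dataset, f"shard_{gpu_id}.arrow"),  # hypothetical file name
            )
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
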
@@ -117,12 +107,17 @@ def process_shard_worker(
     else:
         shard = cast(Dataset, loaded)
 
+    shard = shard.with_format("torch")
+    shard.set_format(type="torch", columns=["image"])
+
     print(f"[GPU {gpu_id}] Processing {len(shard)} examples...", flush=True)
     result = shard.map(
         lambda batch: caption_batch(batch, processor, model),
         batched=True,
         batch_size=batch_size,
-        remove_columns=shard.column_names,
+        remove_columns=[col for col in shard.column_names if col != "image"],
+        writer_batch_size=1000,
+        keep_in_memory=True,
     )
 
     print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)
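
Note: remove_columns now preserves the image column, so caption_batch no longer has to echo images back in its return value. A toy example of how Dataset.map merges returned keys with surviving columns (data is made up):

    from datasets import Dataset

    ds = Dataset.from_dict({"image": [0, 1, 2], "meta": ["a", "b", "c"]})

    # Columns listed in remove_columns are dropped after the function runs;
    # keys of the returned dict become new columns. Leaving "image" out of
    # the list keeps it alongside the new "text" column.
    out = ds.map(
        lambda batch: {"text": [f"caption {i}" for i in batch["image"]]},
        batched=True,
        batch_size=2,
        remove_columns=["meta"],
    )
    print(out.column_names)  # ['image', 'text']
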
@@ -134,9 +129,9 @@ def process_shard_worker(
 
 def main():
     input_dataset = "none-yet/anime-captions"
-    output_dataset = input_dataset
+    output_dataset = "nroggendorff/anime-captions"
     model_name = "datalab-to/chandra"
-    batch_size = 12
+    batch_size = 32
 
     print("Loading dataset info...")
     loaded = datasets.load_dataset(input_dataset, split="train")
@@ -182,7 +177,7 @@ def main():
 
     print(f"Final dataset size: {len(final_ds)}")
     print("Pushing to hub...")
-    final_ds.push_to_hub(output_dataset, create_pr=True)
+    final_ds.push_to_hub(output_dataset, create_pr=False)
 
     print("Cleaning up temporary files...")
     for f in temp_files:
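
Note: with create_pr=False, push_to_hub commits directly to the repository's default branch instead of opening a Hub pull request, which fits the switch to a repo the committer controls. Usage sketch (repo id hypothetical):

    from datasets import Dataset

    ds = Dataset.from_dict({"text": ["example caption"]})
    # create_pr=True would open a reviewable pull request on the Hub;
    # create_pr=False writes straight to the default branch.
    ds.push_to_hub("some-user/some-dataset", create_pr=False)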