nroggendorff committed on
Commit
c446fbc
·
verified ·
1 Parent(s): 789627e

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +20 -17
train.py CHANGED
@@ -50,35 +50,38 @@ def getTemplate(processor):
50
  )
51
 
52
 
53
- def preprocess_example(example, processor):
54
- image = example["image"]
55
- if isinstance(image, Image.Image):
56
- if image.mode != "RGB":
57
- image = image.convert("RGB")
58
- else:
59
- raise ValueError("Image must be a PIL Image")
 
 
 
60
 
61
- text = getTemplate(processor)
62
  return {
63
- "image": image,
64
- "text_prompt": text,
65
  }
66
 
67
 
68
- def run_preprocessing(input_dataset, output_dir, num_proc=4):
69
  print("Loading dataset for preprocessing...")
70
  ds = datasets.load_dataset(input_dataset, split="train")
71
 
72
  print("Loading processor...")
73
  processor = AutoProcessor.from_pretrained("datalab-to/chandra")
 
74
 
75
  print("Running preprocessing...")
76
  processed_ds = ds.map(
77
- lambda ex: preprocess_example(ex, processor),
78
- remove_columns=[
79
- col for col in ds.column_names if col not in ["image", "text_prompt"]
80
- ],
81
  num_proc=num_proc,
 
 
82
  )
83
 
84
  print(f"Saving preprocessed dataset to {output_dir}...")
@@ -88,7 +91,7 @@ def run_preprocessing(input_dataset, output_dir, num_proc=4):
88
 
89
  def caption_batch(batch, processor, model):
90
  images = batch["image"]
91
- texts = batch["text_prompt"]
92
 
93
  inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)
94
 
@@ -141,7 +144,7 @@ def process_shard(
141
  lambda batch: caption_batch(batch, processor, model),
142
  batched=True,
143
  batch_size=batch_size,
144
- remove_columns=["text_prompt"],
145
  )
146
 
147
  print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)
 
50
  )
51
 
52
 
53
+ def preprocess_example_batch(examples, text):
54
+ processed_images = []
55
+
56
+ for image in examples["image"]:
57
+ if isinstance(image, Image.Image):
58
+ if image.mode != "RGB":
59
+ image = image.convert("RGB")
60
+ processed_images.append(image)
61
+ else:
62
+ raise ValueError("Image must be a PIL Image")
63
 
 
64
  return {
65
+ "image": processed_images,
66
+ "text": [text] * len(processed_images),
67
  }
68
 
69
 
70
+ def run_preprocessing(input_dataset, output_dir, num_proc=32, batch_size=100):
71
  print("Loading dataset for preprocessing...")
72
  ds = datasets.load_dataset(input_dataset, split="train")
73
 
74
  print("Loading processor...")
75
  processor = AutoProcessor.from_pretrained("datalab-to/chandra")
76
+ text = getTemplate(processor)
77
 
78
  print("Running preprocessing...")
79
  processed_ds = ds.map(
80
+ lambda ex: preprocess_example_batch(ex, text),
81
+ remove_columns=[col for col in ds.column_names if col not in ["image", "text"]],
 
 
82
  num_proc=num_proc,
83
+ batched=True,
84
+ batch_size=batch_size,
85
  )
86
 
87
  print(f"Saving preprocessed dataset to {output_dir}...")
 
91
 
92
  def caption_batch(batch, processor, model):
93
  images = batch["image"]
94
+ texts = batch["text"]
95
 
96
  inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)
97
 
 
144
  lambda batch: caption_batch(batch, processor, model),
145
  batched=True,
146
  batch_size=batch_size,
147
+ remove_columns=["text"],
148
  )
149
 
150
  print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)