nroggendorff committed on
Commit 789627e · verified · 1 Parent(s): c79fe94

Update train.py

Files changed (1)
train.py +95 -124
train.py CHANGED
@@ -31,7 +31,7 @@ def load_model(model_name, device_id=0):
     return processor, model
 
 
-def build_template(processor):
+def getTemplate(processor):
     msg = [
         {
             "role": "user",
@@ -44,92 +44,54 @@ def build_template(processor):
             ],
         }
     ]
+
     return processor.apply_chat_template(
         msg, add_generation_prompt=True, tokenize=False
     )
 
 
-def iterable_to_map(ds, chunk_size=10000):
-    buffer = []
-    for ex in ds:
-        buffer.append(ex)
-        if len(buffer) >= chunk_size:
-            yield buffer
-            buffer = []
-
-
-def cpu_preprocess(input_dataset, output_folder, model_name):
-    print("CPU preprocessing…")
-
-    processor = AutoProcessor.from_pretrained(model_name)
-    template = build_template(processor)
-
-    def _pp(batch):
-        out_images = []
-        for img in batch["image"]:
-            if isinstance(img, Image.Image):
-                if img.mode != "RGB":
-                    img = img.convert("RGB")
-            out_images.append(img)
-
-        prompts = [template] * len(out_images)
-        return {
-            "image": out_images,
-            "prompt": prompts,
-        }
+def preprocess_example(example, processor):
+    image = example["image"]
+    if isinstance(image, Image.Image):
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+    else:
+        raise ValueError("Image must be a PIL Image")
+
+    text = getTemplate(processor)
+    return {
+        "image": image,
+        "text_prompt": text,
+    }
 
+
+def run_preprocessing(input_dataset, output_dir, num_proc=4):
+    print("Loading dataset for preprocessing...")
     ds = datasets.load_dataset(input_dataset, split="train")
 
-    if ds is None:
-        raise ValueError(
-            f"Failed to load dataset '{input_dataset}' with split 'train'. Check the dataset name or available splits."
-        )
-
-    if isinstance(ds, datasets.DatasetDict):
-        if "train" in ds:
-            ds = ds["train"]
-        else:
-            raise ValueError(
-                f"'{input_dataset}' does not contain a 'train' split. Available splits: {list(ds.keys())}"
-            )
-
-    if not isinstance(ds, datasets.Dataset):
-        raise TypeError(f"Expected a Dataset instance, got {type(ds)}")
-
-    print(f"Dataset loaded: {len(ds)} examples")
+    print("Loading processor...")
+    processor = AutoProcessor.from_pretrained("datalab-to/chandra")
 
-    ds2 = ds.map(
-        _pp,
-        batched=True,
-        remove_columns=[c for c in ds.column_names if c not in ("image",)],
+    print("Running preprocessing...")
+    processed_ds = ds.map(
+        lambda ex: preprocess_example(ex, processor),
+        remove_columns=[
+            col for col in ds.column_names if col not in ["image", "text_prompt"]
+        ],
+        num_proc=num_proc,
     )
 
-    print("Saving CPU-preprocessed dataset")
-    parts = []
-    for chunk in iterable_to_map(ds2):
-        part = Dataset.from_list(chunk)
-        parts.append(part)
-
-    ds2 = datasets.concatenate_datasets(parts)
-    ds2.save_to_disk(output_folder)
-
-    print("CPU preprocessing done.")
+    print(f"Saving preprocessed dataset to {output_dir}...")
+    processed_ds.save_to_disk(output_dir)
+    print("Preprocessing done.")
 
 
-def caption_batch(batch, processor, model):
-    imgs = batch["image"]
-    prompts = batch["prompt"]
-
-    pil_images = []
-    for image in imgs:
-        if isinstance(image, Image.Image):
-            if image.mode != "RGB":
-                image = image.convert("RGB")
-        pil_images.append(image)
-
-    inputs = processor(
-        text=prompts, images=pil_images, return_tensors="pt", padding=True
-    )
+def caption_batch(batch, processor, model):
+    images = batch["image"]
+    texts = batch["text_prompt"]
+
+    inputs = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
     inputs = {
         k: v.pin_memory().to(model.device, non_blocking=True) for k, v in inputs.items()
     }
@@ -144,47 +106,49 @@ def caption_batch(batch, processor, model):
     decoded = processor.batch_decode(generated, skip_special_tokens=False)
 
     captions = []
-    special = set(processor.tokenizer.all_special_tokens)
-
+    special_tokens = set(processor.tokenizer.all_special_tokens)
     for d in decoded:
         if "<|im_start|>assistant" in d:
             d = d.split("<|im_start|>assistant")[-1]
-        for token in special:
+
+        for token in special_tokens:
             d = d.replace(token, "")
-        captions.append(d.strip())
 
-    return {"text": captions}
+        d = d.strip()
+        captions.append(d)
+
+    return {
+        "text": captions,
+    }
 
 
 def process_shard(
-    gpu_id, start, end, model_name, batch_size, prepped_folder, output_file
+    gpu_id, start, end, model_name, batch_size, input_dataset, output_file
 ):
     try:
         torch.cuda.set_device(gpu_id)
 
-        print(f"[GPU {gpu_id}] Loading model", flush=True)
+        print(f"[GPU {gpu_id}] Loading model...", flush=True)
         processor, model = load_model(model_name, gpu_id)
 
-        print(f"[GPU {gpu_id}] Loading preprocessed shard [{start}:{end}]", flush=True)
-        shard = datasets.load_from_disk(prepped_folder)
-        if isinstance(shard, datasets.DatasetDict):
-            shard = shard["train"]
-        shard = shard.select(range(start, end))
+        print(f"[GPU {gpu_id}] Loading data shard [{start}:{end}]...", flush=True)
+        loaded = datasets.load_from_disk(input_dataset).select(range(start, end))
+
+        shard = cast(Dataset, loaded)
 
-        print(f"[GPU {gpu_id}] Captioning {len(shard)} examples", flush=True)
+        print(f"[GPU {gpu_id}] Processing {len(shard)} examples...", flush=True)
         result = shard.map(
             lambda batch: caption_batch(batch, processor, model),
             batched=True,
             batch_size=batch_size,
-            remove_columns=["image", "prompt"],
+            remove_columns=["text_prompt"],
        )
 
-        print(f"[GPU {gpu_id}] Saving {output_file}", flush=True)
+        print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)
         result.save_to_disk(output_file)
 
-        print(f"[GPU {gpu_id}] Done.", flush=True)
+        print(f"[GPU {gpu_id}] Done!", flush=True)
         return output_file
 
     except Exception as e:
         print(f"[GPU {gpu_id}] Error: {e}", flush=True)
         raise
@@ -194,37 +158,44 @@ def main():
     mp.set_start_method("spawn", force=True)
 
     input_dataset = "none-yet/anime-captions"
-    prepped_folder = "cpu_preprocessed"
+    preprocessed_dataset = "temp_preprocessed"
     output_dataset = "nroggendorff/anime-captions"
     model_name = "datalab-to/chandra"
     batch_size = 20
 
-    if not os.path.exists(prepped_folder):
-        cpu_preprocess(input_dataset, prepped_folder, model_name)
-
-    ds = datasets.load_from_disk(prepped_folder)
-    total = len(ds)
+    if not os.path.exists(preprocessed_dataset):
+        run_preprocessing(input_dataset, preprocessed_dataset)
 
+    print("Loading preprocessed dataset...")
+    ds = datasets.load_from_disk(preprocessed_dataset)
     num_gpus = torch.cuda.device_count()
-    shard = total // num_gpus
+    total_size = len(ds)
+    shard_size = total_size // num_gpus
 
-    print(f"Dataset size: {total}")
+    print(f"Dataset size: {total_size}")
     print(f"Using {num_gpus} GPUs")
-    print(f"Shard size: {shard}")
+    print(f"Shard size: {shard_size}")
 
     processes = []
     temp_files = []
 
     for i in range(num_gpus):
-        s = i * shard
-        e = s + shard if i < num_gpus - 1 else total
-
-        of = f"temp_shard_{i}"
-        temp_files.append(of)
+        start = i * shard_size
+        end = start + shard_size if i < num_gpus - 1 else total_size
+        output_file = f"temp_shard_{i}"
+        temp_files.append(output_file)
 
         p = mp.Process(
             target=process_shard,
-            args=(i, s, e, model_name, batch_size, prepped_folder, of),
+            args=(
+                i,
+                start,
+                end,
+                model_name,
+                batch_size,
+                preprocessed_dataset,
+                output_file,
+            ),
         )
         p.start()
         processes.append(p)
@@ -232,32 +203,32 @@ def main():
     for p in processes:
         p.join()
         if p.exitcode != 0:
-            print("A process failed, aborting…")
-            for q in processes:
-                if q.is_alive():
-                    q.terminate()
-            for q in processes:
-                q.join()
-            raise RuntimeError("GPU worker failed.")
-
-    print("Merging shards…")
-    parts = []
-    for f in temp_files:
-        ds = datasets.load_from_disk(f)
-        if isinstance(ds, datasets.DatasetDict):
-            ds = ds["train"]
-        parts.append(ds)
-
-    final_ds = datasets.concatenate_datasets(parts)
-
-    print(f"Pushing final dataset to {output_dataset}…")
+            print(f"\nProcess failed with exit code {p.exitcode}", flush=True)
+            print("Terminating all processes...", flush=True)
+            for proc in processes:
+                if proc.is_alive():
+                    proc.terminate()
+            for proc in processes:
+                proc.join()
+            raise RuntimeError(f"At least one process failed")
+
+    print("\nAll processes completed. Loading and concatenating results...")
+
+    shards = [cast(Dataset, datasets.load_from_disk(f)) for f in temp_files]
+    final_ds = datasets.concatenate_datasets(shards)
+
+    print(f"Final dataset size: {len(final_ds)}")
+    print("Pushing to hub...")
     final_ds.push_to_hub(output_dataset, create_pr=False)
 
-    print("Cleaning up")
+    print("Cleaning up temporary files...")
     for f in temp_files:
-        shutil.rmtree(f, ignore_errors=True)
+        if os.path.exists(f):
+            shutil.rmtree(f)
+    if os.path.exists(preprocessed_dataset):
+        shutil.rmtree(preprocessed_dataset)
 
-    print("Done.")
+    print("Done!")
 
 
 if __name__ == "__main__":
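A few notes on patterns this commit relies on.

First, the host-to-device copy in caption_batch pins each tensor before moving it, so the transfer can be issued asynchronously. A minimal sketch of the same pattern, assuming the processor output is a dict of CPU tensors; the helper name to_device is ours, not the script's:

import torch

def to_device(tensors, device):
    # Pin each CPU tensor (page-locked memory) so the host-to-device copy
    # can run asynchronously, then move it with non_blocking=True.
    return {k: v.pin_memory().to(device, non_blocking=True) for k, v in tensors.items()}

batch = {"input_ids": torch.zeros(2, 8, dtype=torch.long)}
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
moved = to_device(batch, device)

Note that pin_memory() expects CPU tensors, and non_blocking=True only overlaps the copy with compute when the source memory is pinned.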
 
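The shard arithmetic in main() gives each GPU a contiguous slice of the preprocessed dataset, with the last GPU absorbing the remainder. A standalone sketch of that logic; shard_bounds is a hypothetical helper for illustration only:

def shard_bounds(total_size, num_gpus):
    # GPU i takes rows [i * shard_size, (i + 1) * shard_size); the last GPU
    # also gets the remainder when total_size % num_gpus != 0.
    shard_size = total_size // num_gpus
    return [
        (i * shard_size, (i + 1) * shard_size if i < num_gpus - 1 else total_size)
        for i in range(num_gpus)
    ]

assert shard_bounds(10, 3) == [(0, 3), (3, 6), (6, 10)]

Each worker then materializes its slice with datasets.load_from_disk(...).select(range(start, end)).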
 
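The process-per-GPU layout depends on the spawn start method: a forked child cannot safely reuse the parent's CUDA context, which is why main() calls mp.set_start_method("spawn", force=True) before launching workers. A minimal sketch of the same structure, assuming mp is torch.multiprocessing (the import lies outside the hunks shown):

import torch
import torch.multiprocessing as mp

def worker(gpu_id):
    # Each spawned process binds to its own device before any CUDA work.
    torch.cuda.set_device(gpu_id)
    print(f"[GPU {gpu_id}] ready", flush=True)

def launch():
    mp.set_start_method("spawn", force=True)
    procs = [mp.Process(target=worker, args=(i,)) for i in range(torch.cuda.device_count())]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
        if p.exitcode != 0:
            raise RuntimeError(f"worker exited with code {p.exitcode}")

if __name__ == "__main__":
    launch()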
 
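Finally, the prompt side: getTemplate wraps processor.apply_chat_template(msg, add_generation_prompt=True, tokenize=False), which returns the prompt as a string ending in an assistant header for the model to complete; that is also why caption_batch later splits decoded output on "<|im_start|>assistant". A sketch assuming a transformers version whose AutoProcessor exposes apply_chat_template; the message payload is a placeholder, since the content list is truncated in the diff:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("datalab-to/chandra")

# Placeholder payload; the actual content entries are not shown in the hunk.
msg = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

prompt = processor.apply_chat_template(msg, add_generation_prompt=True, tokenize=False)
print(prompt)  # one prompt string, reused for every image in the batch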