create-caption

Paused

App Files Files Community

nroggendorff commited on Nov 17

Commit

e1f7a3c

verified ·

1 Parent(s): e6ad1f5

Update train.py

Browse files

Files changed (1) hide show

train.py +209 -107

train.py CHANGED Viewed

@@ -1,121 +1,206 @@
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
 import datasets
 from datasets import Dataset
-from typing import cast
-import os
-import shutil
-import multiprocessing as mp
 from PIL import Image
-def load_model(model_name, device_id=0):
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=False,
     )
     processor = AutoProcessor.from_pretrained(model_name)
-    processor.tokenizer.padding_side = "left"
     model = AutoModelForImageTextToText.from_pretrained(
         model_name,
         quantization_config=bnb_config,
         dtype=torch.bfloat16,
-        device_map={"": device_id},
         attn_implementation="flash_attention_2",
     )
     return processor, model
-def caption_batch(batch, processor, model):
-    images = batch["image"]
     pil_images = []
-    for image in images:
-        if isinstance(image, Image.Image):
-            if image.mode != "RGB":
-                image = image.convert("RGB")
-            pil_images.append(image)
-    msg = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image"},
-                {
-                    "type": "text",
-                    "text": "Describe the image concisely, and skip mentioning that it's illustrated or from anime.",
-                },
-            ],
-        }
-    ]
-    text = processor.apply_chat_template(
-        msg, add_generation_prompt=True, tokenize=False
-    )
-    texts = [text] * len(pil_images)
     inputs = processor(text=texts, images=pil_images, return_tensors="pt", padding=True)
-    inputs = {k: v.to(model.device) for k, v in inputs.items()}
-    with torch.no_grad(), torch.amp.autocast('cuda', dtype=torch.bfloat16):
         generated = model.generate(
             **inputs,
             max_new_tokens=128,
             do_sample=False,
         )
-    decoded = processor.batch_decode(generated, skip_special_tokens=False)
-    captions = []
-    special_tokens = set(processor.tokenizer.all_special_tokens)
-    for d in decoded:
-        if "<|im_start|>assistant" in d:
-            d = d.split("<|im_start|>assistant")[-1]
-        for token in special_tokens:
-            d = d.replace(token, "")
-        d = d.strip()
-        captions.append(d)
-    return {
-        "text": captions,
-    }
-def process_shard(gpu_id, start, end, model_name, batch_size, input_dataset, output_file):
     try:
-        torch.cuda.set_device(gpu_id)
         print(f"[GPU {gpu_id}] Loading model...", flush=True)
-        processor, model = load_model(model_name, gpu_id)
-        print(f"[GPU {gpu_id}] Loading data shard [{start}:{end}]...", flush=True)
-        loaded = datasets.load_dataset(input_dataset, split=f"train[{start}:{end}]")
-        if isinstance(loaded, datasets.DatasetDict):
-            shard = cast(Dataset, loaded["train"])
-        else:
-            shard = cast(Dataset, loaded)
-        print(f"[GPU {gpu_id}] Processing {len(shard)} examples...", flush=True)
         result = shard.map(
-            lambda batch: caption_batch(batch, processor, model),
             batched=True,
-            batch_size=batch_size,
-            remove_columns=[col for col in shard.column_names if col != "image"],
         )
-        print(f"[GPU {gpu_id}] Saving results to {output_file}...", flush=True)
         result.save_to_disk(output_file)
-        print(f"[GPU {gpu_id}] Done!", flush=True)
         return output_file
     except Exception as e:
         print(f"[GPU {gpu_id}] Error: {e}", flush=True)
@@ -123,72 +208,89 @@ def process_shard(gpu_id, start, end, model_name, batch_size, input_dataset, out
 def main():
-    mp.set_start_method('spawn', force=True)
-    input_dataset = "none-yet/anime-captions"
-    output_dataset = "nroggendorff/anime-captions"
-    model_name = "datalab-to/chandra"
-    batch_size = 16
-    print("Loading dataset info...")
-    loaded = datasets.load_dataset(input_dataset, split="train")
-    if isinstance(loaded, datasets.DatasetDict):
-        ds = cast(Dataset, loaded["train"])
     else:
-        ds = cast(Dataset, loaded)
-    num_gpus = torch.cuda.device_count()
     total_size = len(ds)
     shard_size = total_size // num_gpus
-    print(f"Dataset size: {total_size}")
-    print(f"Using {num_gpus} GPUs")
-    print(f"Shard size: {shard_size}")
     processes = []
-    temp_files = []
     for i in range(num_gpus):
         start = i * shard_size
         end = start + shard_size if i < num_gpus - 1 else total_size
-        output_file = f"temp_shard_{i}"
-        temp_files.append(output_file)
         p = mp.Process(
             target=process_shard,
-            args=(i, start, end, model_name, batch_size, input_dataset, output_file),
         )
         p.start()
         processes.append(p)
     for p in processes:
         p.join()
         if p.exitcode != 0:
-            print(f"\nProcess failed with exit code {p.exitcode}", flush=True)
-            print("Terminating all processes...", flush=True)
-            for proc in processes:
-                if proc.is_alive():
-                    proc.terminate()
-            for proc in processes:
-                proc.join()
-            raise RuntimeError(f"At least one process failed")
-    print("\nAll processes completed. Loading and concatenating results...")
-    shards = [cast(Dataset, datasets.load_from_disk(f)) for f in temp_files]
     final_ds = datasets.concatenate_datasets(shards)
-    print(f"Final dataset size: {len(final_ds)}")
-    print("Pushing to hub...")
-    final_ds.push_to_hub(output_dataset, create_pr=False)
-    print("Cleaning up temporary files...")
-    for f in temp_files:
-        if os.path.exists(f):
-            shutil.rmtree(f)
-    print("Done!")
 if __name__ == "__main__":

+# caption_pipeline_fast.py
+import os
+import shutil
+import io
+import multiprocessing as mp
+from typing import Tuple, Dict, Any
 import torch
 from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
 import datasets
 from datasets import Dataset
 from PIL import Image
+# -------------------------
+# CONFIG
+# -------------------------
+INPUT_DATASET = "none-yet/anime-captions"   # original dataset id / path
+PREPROCESSED_DIR = "preprocessed_ds"        # temporary preprocessed dataset on disk
+TEMP_SHARD_PREFIX = "temp_shard_"           # per-GPU result dirs
+OUTPUT_DATASET = "nroggendorff/anime-captions"
+MODEL_NAME = "datalab-to/chandra"
+BATCH_SIZE = 32                             # try 32 or 64 depending on VRAM
+PREPROCESS_NUM_PROC = max(1, mp.cpu_count() - 2)
+DEVICE_BATCH_PREPIN = True                  # pin memory before to(device)
+USE_BETTERTRANSFORMER = True                # try BetterTransformer if installed
+# -------------------------
+def preprocess_example(example: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Convert image to RGB bytes and store the prompt string once per example.
+    This is run in main process (once).
+    """
+    img = example["image"]
+    if not isinstance(img, Image.Image):
+        # datasets Image feature may already give PIL or path - handle both
+        try:
+            img = Image.open(io.BytesIO(img))  # if raw bytes
+        except Exception:
+            # fall back to the feature handling
+            img = img.convert("RGB")
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    bio = io.BytesIO()
+    img.save(bio, format="PNG")   # PNG keeps quality and is easy to decode later
+    example["image_bytes"] = bio.getvalue()
+    # keep the original image field for compatibility if you want
+    # but we'll use image_bytes in workers
+    return example
+def prepare_and_save_dataset(input_dataset: str, processor_chat_prompt: str) -> None:
+    """
+    Loads dataset once, preprocesses images to bytes, writes a
+    new field 'image_bytes' and saves to PREPROCESSED_DIR.
+    """
+    print("[main] Loading dataset for preprocessing...")
+    loaded = datasets.load_dataset(input_dataset, split="train")
+    ds = loaded if not isinstance(loaded, datasets.DatasetDict) else loaded["train"]
+    # Remove any columns we don't need (keep image) to save space
+    # But keep other metadata if needed
+    cols_to_remove = [c for c in ds.column_names if c not in ("image",)]
+    if cols_to_remove:
+        ds = ds.remove_columns(cols_to_remove)
+    print(f"[main] Preprocessing images to bytes with {PREPROCESS_NUM_PROC} procs...")
+    ds = ds.map(preprocess_example, remove_columns=[], num_proc=PREPROCESS_NUM_PROC)
+    # store the constant chat template string in dataset (small redundancy) to avoid recomputing
+    print("[main] Storing prompt string per example (small overhead)...")
+    ds = ds.add_column("prompt", [processor_chat_prompt] * len(ds))
+    # save to disk for fast worker access (preprocessed once)
+    if os.path.exists(PREPROCESSED_DIR):
+        shutil.rmtree(PREPROCESSED_DIR)
+    print(f"[main] Saving preprocessed dataset to {PREPROCESSED_DIR} ...")
+    ds.save_to_disk(PREPROCESSED_DIR)
+    print("[main] Preprocessing complete.")
+def load_model_for_gpu(model_name: str, gpu_id: int):
+    """
+    Load model + processor on the target GPU with 4-bit config (like your original)
+    """
+    torch.cuda.set_device(gpu_id)
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
         bnb_4bit_compute_dtype=torch.bfloat16,
         bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
     )
     processor = AutoProcessor.from_pretrained(model_name)
+    # keep left padding as you had
+    try:
+        processor.tokenizer.padding_side = "left"
+    except Exception:
+        pass
     model = AutoModelForImageTextToText.from_pretrained(
         model_name,
         quantization_config=bnb_config,
         dtype=torch.bfloat16,
+        device_map={"": gpu_id},
         attn_implementation="flash_attention_2",
     )
+    # Try BetterTransformer if available
+    if USE_BETTERTRANSFORMER:
+        try:
+            from optimum.bettertransformer import BetterTransformer
+            model = BetterTransformer.transform(model)
+            print(f"[GPU {gpu_id}] Applied BetterTransformer.")
+        except Exception:
+            # not fatal
+            print(f"[GPU {gpu_id}] BetterTransformer unavailable or failed; continuing.")
+    model.eval()
     return processor, model
+def caption_batch_from_bytes(batch: Dict[str, Any], processor, model) -> Dict[str, Any]:
+    """
+    Given a batch from the preprocessed dataset (contains 'image_bytes' and 'prompt'),
+    reconstruct PIL images, call processor, run generate, decode, and return texts.
+    """
+    image_bytes_list = batch["image_bytes"]
+    prompts = batch["prompt"]
+    assert len(image_bytes_list) == len(prompts)
     pil_images = []
+    for b in image_bytes_list:
+        img = Image.open(io.BytesIO(b))
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+        pil_images.append(img)
+    # processor.apply_chat_template was already run on main, so prompts are ready strings
+    texts = list(prompts)
+    # Build inputs. This step will perform tokenizer + image feature extraction.
     inputs = processor(text=texts, images=pil_images, return_tensors="pt", padding=True)
+    # Pin memory for faster host->device copy if enabled
+    if DEVICE_BATCH_PREPIN:
+        for k, v in inputs.items():
+            if torch.is_tensor(v):
+                inputs[k] = v.pin_memory()
+    # Move to device with non_blocking transfer (works with pinned memory)
+    device = model.device
+    inputs = {k: (v.to(device, non_blocking=True) if torch.is_tensor(v) else v) for k, v in inputs.items()}
+    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
         generated = model.generate(
             **inputs,
             max_new_tokens=128,
             do_sample=False,
+            num_beams=1,
         )
+    # decode skipping special tokens to avoid expensive post-processing
+    decoded = processor.batch_decode(generated, skip_special_tokens=True)
+    # clean and return
+    return {"text": [d.strip() for d in decoded]}
+def process_shard(gpu_id: int, start: int, end: int, output_file: str):
+    """
+    Worker process: loads the preprocessed dataset shard, loads the model on the GPU,
+    runs batched generation and saves the results to disk.
+    """
     try:
         print(f"[GPU {gpu_id}] Loading model...", flush=True)
+        processor, model = load_model_for_gpu(MODEL_NAME, gpu_id)
+        print(f"[GPU {gpu_id}] Loading preprocessed dataset from disk...", flush=True)
+        ds = datasets.load_from_disk(PREPROCESSED_DIR)
+        # slice with select for a true copy
+        indices = list(range(start, end))
+        shard = ds.select(indices)
+        print(f"[GPU {gpu_id}] Processing {len(shard)} examples (shard indices {start}:{end}) ...", flush=True)
+        # map with batched generator function (uses our caption_batch_from_bytes)
         result = shard.map(
+            lambda batch: caption_batch_from_bytes(batch, processor, model),
             batched=True,
+            batch_size=BATCH_SIZE,
+            remove_columns=[col for col in shard.column_names if col not in ("image_bytes", "prompt")],
+            num_proc=1,  # model inference must run in the GPU process (no multiproc here)
         )
+        print(f"[GPU {gpu_id}] Saving results to {output_file} ...", flush=True)
+        if os.path.exists(output_file):
+            shutil.rmtree(output_file)
         result.save_to_disk(output_file)
+        print(f"[GPU {gpu_id}] Done.", flush=True)
         return output_file
     except Exception as e:
         print(f"[GPU {gpu_id}] Error: {e}", flush=True)
 def main():
+    mp.set_start_method("spawn", force=True)
+    # 1) Load processor temporarily to build the chat prompt once
+    print("[main] Loading processor to create chat prompt...")
+    tmp_proc = AutoProcessor.from_pretrained(MODEL_NAME)
+    chat_msg = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {
+                    "type": "text",
+                    "text": "Describe the image concisely, and skip mentioning that it's illustrated or from anime.",
+                },
+            ],
+        }
+    ]
+    # keep tokenize=False so we store the raw prompt and let processor tokenize in workers with padding semantics
+    prompt_str = tmp_proc.apply_chat_template(chat_msg, add_generation_prompt=True, tokenize=False)
+    del tmp_proc
+    # 2) Preprocess dataset once (images -> bytes, add prompt column)
+    if not os.path.exists(PREPROCESSED_DIR):
+        prepare_and_save_dataset(INPUT_DATASET, prompt_str)
     else:
+        print(f"[main] Preprocessed dataset found at {PREPROCESSED_DIR}, skipping preprocess.")
+    # 3) Load the preprocessed dataset to compute shard indices
+    ds = datasets.load_from_disk(PREPROCESSED_DIR)
     total_size = len(ds)
+    num_gpus = torch.cuda.device_count()
+    if num_gpus == 0:
+        raise RuntimeError("No GPUs found. This script requires GPUs.")
     shard_size = total_size // num_gpus
+    print(f"[main] Dataset size: {total_size}")
+    print(f"[main] Using {num_gpus} GPUs (shard size {shard_size})")
+    # 4) Spawn worker processes
     processes = []
+    temp_dirs = []
     for i in range(num_gpus):
         start = i * shard_size
         end = start + shard_size if i < num_gpus - 1 else total_size
+        out_dir = f"{TEMP_SHARD_PREFIX}{i}"
+        temp_dirs.append(out_dir)
         p = mp.Process(
             target=process_shard,
+            args=(i, start, end, out_dir),
+            daemon=False,
         )
         p.start()
         processes.append(p)
+    # 5) wait for processes
     for p in processes:
         p.join()
         if p.exitcode != 0:
+            print(f"[main] Process {p.pid} failed with exit code {p.exitcode}. Terminating others.", flush=True)
+            for q in processes:
+                if q.is_alive():
+                    q.terminate()
+            for q in processes:
+                q.join()
+            raise RuntimeError("At least one GPU worker failed.")
+    print("[main] All workers finished. Concatenating shards...")
+    shards = [datasets.load_from_disk(d) for d in temp_dirs]
     final_ds = datasets.concatenate_datasets(shards)
+    print(f"[main] Final dataset size: {len(final_ds)}. Pushing to hub as {OUTPUT_DATASET} ...")
+    final_ds.push_to_hub(OUTPUT_DATASET, create_pr=False)
+    print("[main] Cleaning up temporary files...")
+    for d in temp_dirs:
+        if os.path.exists(d):
+            shutil.rmtree(d)
+    # optionally keep PREPROCESSED_DIR for re-runs; comment out removal if you want to keep it
+    # shutil.rmtree(PREPROCESSED_DIR)
+    print("[main] Done.")
 if __name__ == "__main__":