Spaces:

GlobalStudio
/

starflow

Sleeping

App Files Files Community

leoeric committed 5 days ago

Commit

5db3c1f

1 Parent(s): 0194c79

Add dataset.py and fix OpenMP warning

Browse files

Files changed (2) hide show

app.py +3 -0
dataset.py +929 -0

app.py CHANGED Viewed

@@ -12,6 +12,9 @@ import subprocess
 import pathlib
 from pathlib import Path
 # Try to import huggingface_hub for downloading checkpoints
 try:
     from huggingface_hub import hf_hub_download

 import pathlib
 from pathlib import Path
+# Fix OpenMP warning
+os.environ['OMP_NUM_THREADS'] = '1'
 # Try to import huggingface_hub for downloading checkpoints
 try:
     from huggingface_hub import hf_hub_download

dataset.py ADDED Viewed

	@@ -0,0 +1,929 @@

+#
+# For licensing see accompanying LICENSE file.
+# Copyright (C) 2025 Apple Inc. All Rights Reserved.
+#
+import io
+import os
+import csv
+import json
+import random
+import torch
+import numpy as np
+import math
+import time
+import contextlib
+from typing import Optional, Union
+from PIL import Image
+from collections import defaultdict
+from torch.utils.data import Dataset, DataLoader
+from torchvision import transforms
+from torch.utils.data import default_collate, get_worker_info
+import tarfile
+import tqdm
+import gc
+import threading
+import psutil
+import tempfile
+import decord
+from decord import VideoReader
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor, TimeoutError
+from misc import print, xprint
+from misc.condition_utils import get_camera_condition, get_point_condition, get_wind_condition
+# Initialize multiprocessing manager
+# Created once at import time; OnlineImageTarDataset uses it for its class-level
+# `tar_keys_lock` (manager.Lock()) and for the `tar_keys` list (manager.list()).
+manager = torch.multiprocessing.Manager()
+# ==== helpers ==== #
+@contextlib.contextmanager
+def ram_temp_file(data, suffix=".mp4"):
+    available_ram = psutil.virtual_memory().available
+    video_size = len(data)
+    # Use RAM if available, otherwise fall back to disk
+    if video_size < available_ram - (500 * 1024 * 1024):
+        temp_dir = "/dev/shm"  # RAM disk
+    else:
+        temp_dir = None  # Default system temp (disk)
+    with tempfile.NamedTemporaryFile(dir=temp_dir, suffix=suffix, delete=True) as temp_file:
+        temp_file.write(data)
+        temp_file.flush()
+        yield temp_file.name
+def _nearest_multiple(x: float, base: int = 8) -> int:
+    """Round x to the nearest multiple of `base`."""
+    return int(round(x / base)) * base
+def aspect_ratio_to_image_size(target_size, R, multiple=8):
+    if R is None:
+        return target_size, target_size
+    if isinstance(R, str):
+        rw, rh = map(int, R.split(':'))
+        R = rw / rh
+    area  = target_size ** 2
+    out_h = _nearest_multiple(math.sqrt(area / R), multiple)
+    out_w = _nearest_multiple(math.sqrt(area * R), multiple)
+    return out_h, out_w
+def read_tsv(filename):
+    # Open the TSV file for reading
+    with open(filename, 'r', newline='') as tsvfile:
+        reader = csv.reader(tsvfile, delimiter='\t')
+        rows = []
+        while True:
+            try:
+                r = next(reader)
+                rows.append(r)
+            except csv.Error as e:
+                print(f'{e}')
+            except StopIteration:
+                break
+        return rows
+def sample_clip(
+    video_path: str,
+    num_frames: int = 8,
+    out_fps: Optional[float] = None,      # ← pass an fps here
+):
+    vr       = VideoReader(video_path)
+    src_fps  = vr.get_avg_fps()        # native fps
+    total    = len(vr)
+    if out_fps is None or out_fps >= src_fps:
+        step = 1                       # keep native rate or up-sample later
+    else:
+        target_duration = (num_frames - 1) / out_fps  # duration in seconds
+        frame_span = target_duration * src_fps   # frames needed for this duration
+        step = max(frame_span / (num_frames - 1), 1)
+    max_start = total - step * (num_frames - 1)
+    if max_start <= 1:  # video too short for requested clip
+        indices = np.linspace(0, total - 1, num_frames, dtype=int)
+        return vr.get_batch(indices.tolist()), indices
+    max_start = int(np.floor(max_start - 1))
+    start  = random.randint(0, max_start) if max_start > 0 else 0
+    idxs   = [int(np.round(start + i * step)) for i in range(num_frames)]
+    return vr.get_batch(idxs), idxs
+class InfiniteDataLoader(torch.utils.data.DataLoader):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Initialize an iterator over the dataset.
+        self.dataset_iterator = super().__iter__()
+    def __iter__(self):
+        return self
+    def __next__(self):
+        try:
+            batch = next(self.dataset_iterator)
+        except StopIteration:
+            # Dataset exhausted, use a new fresh iterator.
+            print('Another Loop over the dataset', flush=True)
+            self.dataset_iterator = super().__iter__()
+            batch = next(self.dataset_iterator)
+        return batch
+class DataLoaderWrapper(InfiniteDataLoader):
+    def __iter__(self):
+        return IterWrapper(super().__iter__())
+class IterWrapper:
+    def __init__(self, obj):
+        self.obj = obj
+    def __iter__(self):
+        return self
+    def __next__(self):
+        return self.next()
+    def next(self):
+        return next(self.obj)
+# ==== Dataset Implementation, Load your own data ==== #
+class ImageTarDataset(Dataset):
+    def __init__(self, dataset_tsv, image_size, temporal_size=None, rank=0,  world_size=1,
+                 use_image_bucket=False, multiple=8, no_flip=False, edit=False):
+        all_lines = []
+        # get all data lines
+        self.buckets = {}
+        self.weights = {}
+        self.image_buckets = defaultdict(lambda: 0)
+        self.image_buckets['1:1'] = 0  # default bucket
+        skipped = 0
+        for line in tqdm.tqdm(read_tsv(dataset_tsv)[1:]):
+            tsv_file = line[0]
+            bucket   = line[1] if len(line) > 1 else 'mlx'
+            caption  = line[2] if len(line) > 2 else 'caption'
+            weights  = float(line[3] if len(line) > 3 else "1")
+            all_data = read_tsv(tsv_file)
+            all_maps = {all_data[0][i]: i for i in range(len(all_data[0]))}
+            self.weights[all_data[1][0]] = weights
+            for line in all_data[1:]:
+                try:
+                    if 'width' in all_maps:  # filter too small images
+                        width, height = int(line[all_maps['width']]), int(line[all_maps['height']])
+                        if width * height < (image_size * image_size) / 2:  # if image is smaller than half size of the target size
+                            skipped += 1; continue
+                    if caption != 'folder':  # input caption has higher priority
+                        captions  = caption.split('|')[0].split(':')
+                        operation = caption.split('|')[1] if len(caption.split('|')) > 1 else "none"
+                        caption_line = ([line[all_maps[c]] for c in captions], operation)
+                    else:
+                        caption_line = (line[all_maps['file']].split('/')[-2], "none")  # use folder name as caption
+                    items = {'tar': line[all_maps['tar']], 'file': line[all_maps['file']], 'caption': caption_line,
+                             'image_bucket': line[all_maps['image_bucket']] if 'image_bucket' in all_maps else "1:1"}
+                    if "camera_file" in all_maps: # dl3dv data
+                        items["camera_file"] = line[all_maps["camera_file"]]
+                    if "force_caption" in all_maps: # force dataset
+                        items["force_caption"] = line[all_maps["force_caption"]]
+                        if "wind_speed" in all_maps: # wind force
+                            items["wind_speed"] = line[all_maps["wind_speed"]]
+                            items["wind_angle"] = line[all_maps["wind_angle"]]
+                        elif "force" in all_maps: # point-wise
+                            items["force"] = line[all_maps["force"]]
+                            items["angle"] = line[all_maps["angle"]]
+                            items["coordx"] = line[all_maps["coordx"]]
+                            items["coordy"] = line[all_maps["coordy"]]
+                    if edit:
+                        if line[all_maps['visual_file']] != 'none': continue  # TODO: for now, we only support one image, no visual clue
+                        items['edit_instruction'] = line[all_maps['edit_instruction']]
+                        items['edited_file'] = line[all_maps['edited_file']]
+                    all_lines.append(items)
+                except Exception as e:
+                    skipped += 1; continue
+                image_bucket = all_lines[-1]['image_bucket']
+                self.image_buckets[image_bucket] += 1
+                if all_lines[-1]['tar'] not in self.buckets:
+                    self.buckets[all_lines[-1]['tar']] = bucket
+        if "force_caption" in all_lines[0]:
+            wind_forces = [l["wind_speed"] for l in all_lines] if "wind_speed" in all_lines[0] else [l["force"] for l in all_lines]
+            self.min_wind_force = min(wind_forces)
+            self.max_wind_force = max(wind_forces)
+        self.use_image_bucket = use_image_bucket
+        self.all_lines = all_lines[rank:][::world_size]   # all lines is sorted by tar file
+        self.num_samples_per_rank = None
+        self.image_size = image_size
+        self.multiple = multiple
+        self.temporal_size = tuple(map(int, temporal_size.split(':'))) if isinstance(temporal_size, str) else None
+        self.edit_mode = edit
+        def center_crop_resize(img, ratio="1:1", target_size: int = 256, multiple: int = 8):
+            """
+            1. Center crop `img` to the largest window with aspect ratio = ratio.
+            2. Resize so  HxW ≈ target_size²  (each side a multiple of `multiple`).
+            Args
+            ----
+            img         : PIL Image or torch tensor (CHW/HWC)
+            ratio       : "3:2", (3,2), "1:1", etc.
+            target_size : reference side length (area = target_size²)
+            multiple    : force each output side to be a multiple of this number
+            """
+            # --- parse ratio ----------------------------------------------------------
+            if isinstance(ratio, str):
+                rw, rh = map(int, ratio.split(':'))
+            else:                                 # already a tuple/list
+                rw, rh = ratio
+            R = rw / rh                           # width / height
+            # --- crop to that aspect ratio -------------------------------------------
+            w, h = img.size if hasattr(img, "size") else (img.shape[-1], img.shape[-2])
+            if w / h > R:                         # image too wide → trim width
+                crop_h, crop_w = h, int(round(h * R))
+            else:                                 # image too tall → trim height
+                crop_w, crop_h = w, int(round(w / R))
+            img = transforms.functional.center_crop(img, (crop_h, crop_w))
+            # --- compute output dimensions -------------------------------------------
+            area  = target_size ** 2
+            out_h = _nearest_multiple(math.sqrt(area / R), multiple)
+            out_w = _nearest_multiple(math.sqrt(area * R), multiple)
+            # --- resize & return ------------------------------------------------------
+            return transforms.functional.resize(img, (out_h, out_w), antialias=True)
+        self.transforms = {}
+        self.size_bucket_maps = {}
+        self.bucket_size_maps = {}
+        for bucket in self.image_buckets:
+            trans = [transforms.Lambda(lambda img, r=bucket: center_crop_resize(img, ratio=r, target_size=image_size, multiple=multiple))]
+            if not no_flip:
+                trans.append(transforms.RandomHorizontalFlip())
+            trans.extend([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
+            self.transforms[bucket] = transforms.Compose(trans)
+            w, h = map(int, bucket.split(':'))
+            out_h, out_w = aspect_ratio_to_image_size(image_size, w / h, multiple=multiple)
+            self.size_bucket_maps[(out_h, out_w)] = bucket
+            self.bucket_size_maps[bucket] = (out_h, out_w)
+        self.transform = self.transforms['1:1']  # default transform
+        print(f"Rank0 -- Loading {len(self.all_lines)} lines of data | {skipped} lines are skipped due to size or error")
+    def __len__(self):
+        if self.num_samples_per_rank is not None:
+            return self.num_samples_per_rank
+        return len(self.all_lines)
+    def __getitem__(self, idx):
+        image_item = self.all_lines[idx]
+        tar_file = image_item['tar']
+        img_file = image_item['file']
+        img_bucket = image_item['image_bucket']
+        try:
+            with tarfile.open(tar_file, mode='r') as tar:
+                img = self._read_image(tar, img_file, img_bucket)
+                H0, W0 = img.size
+                scale  = self.image_size / min(H0, W0)
+                state  = np.array([scale, H0, W0])
+        except Exception as e:
+            print(f'Reading data error {e}')
+        sample = image_item.copy()
+        sample.update(image=img, state=state)
+        return sample
+    def _read_image(self, tar, img_file, img_bucket):
+        def _transform(img):
+            if not self.use_image_bucket:
+                return self.transform(img)
+            else:
+                return self.transforms[img_bucket](img)
+        x_shape = aspect_ratio_to_image_size(self.image_size, img_bucket, multiple=self.multiple)
+        if self.temporal_size is not None:  # read video
+            num_frames, out_fps = self.temporal_size[0], self.temporal_size[1:]
+            if len(out_fps) == 1:
+                out_fps = out_fps[0]
+            else:
+                out_fps = random.choice(out_fps)  # randomly choose one fps from the list
+            assert img_file.endswith('.mp4'), "Only support mp4 video for now"
+            try:
+                with tar.extractfile(img_file) as video_data:
+                    with ram_temp_file(video_data.read()) as tmp_path:
+                        frames, frame_inds = sample_clip(tmp_path, num_frames=num_frames, out_fps=out_fps)
+                        frames = frames.asnumpy()
+            except Exception as e:
+                print(f'Reading data error {e} {img_file}')
+                frames = np.zeros((num_frames, x_shape[0], x_shape[1], 3), dtype=np.uint8)
+            return torch.stack([_transform(Image.fromarray(frame)) for frame in frames]), out_fps, frame_inds
+        try:
+            original_img = Image.open(tar.extractfile(img_file)).convert('RGB')
+        except Exception as e:
+            print(f'Reading data error {e} {img_file}')
+            original_img = Image.new('RGB', (x_shape[0], x_shape[1]), (0, 0, 0))
+        return _transform(original_img), 0, None
+    def collate_fn(self, batch):
+        batch = default_collate(batch)
+        return batch
+    def get_batch_modes(self, x):
+        x_aspect   = self.size_bucket_maps.get(x.size()[-2:], "1:1")
+        video_mode = self.temporal_size is not None
+        return x_aspect, video_mode
+class OnlineImageTarDataset(ImageTarDataset):
+    max_retry_n = 20
+    max_read = 4096
+    tar_keys_lock = manager.Lock() if manager is not None else None
+    def __init__(self, dataset_tsv, image_size, batch_size=None, **kwargs):
+        super().__init__(dataset_tsv, image_size, **kwargs)
+        self.tar_lists = defaultdict(lambda: [])
+        self.tar_image_buckets = defaultdict(lambda: defaultdict(lambda: 0))
+        for i, line in enumerate(self.all_lines):
+            tar_file = line['tar']
+            image_bucket = line['image_bucket']
+            self.tar_lists[tar_file] += [i]
+            self.tar_image_buckets[tar_file][image_bucket] += 1
+        self.reset_tar_keys = []
+        for key in self.tar_lists.keys():
+            repeat = int(self.weights.get(key, 1))
+            self.reset_tar_keys.extend([key] * repeat)
+        self.tar_keys = manager.list(self.reset_tar_keys) if manager is not None else list(self.reset_tar_keys)
+        # Use more workers for better prefetching, but limit to reasonable number
+        self.worker_executors = {}
+        self.worker_caches = {}  # each entry: {active:{tar,key,cnt,inner_idx}, prefetch:{future,key}}
+        self.worker_caches_lock = threading.Lock()  # Protect worker_caches access
+        self.shuffle_everything()
+        if self.use_image_bucket:
+            assert batch_size, "batch_size should be set when use_image_bucket is True"
+        self.batch_size = batch_size
+        if self.temporal_size is not None:
+            assert self.temporal_size[0] > 1, "temporal_size should be greater than 1 for video data"
+            self.max_read = 512
+    def cleanup_worker_cache(self, wid):
+        """Clean up worker cache entry and associated resources"""
+        with self.worker_caches_lock:
+            if wid in self.worker_caches:
+                cache_entry = self.worker_caches[wid]
+                # Cancel prefetch future if still running
+                if 'prefetch' in cache_entry and hasattr(cache_entry['prefetch'], 'cancel'):
+                    cache_entry['prefetch'].cancel()
+                if cache_entry.get('tar') is not None:
+                    tar = cache_entry['tar']
+                    self._close_tar(tar)
+                    cache_entry['tar'] = None
+                # Remove the entire cache entry
+                del self.worker_caches[wid]
+                gc.collect()
+    def _s3(self):
+        raise NotImplementedError("Please implement your own _s3() method to return a boto3 session/client")
+    def shuffle_everything(self):
+        for key in tqdm.tqdm(self.tar_keys):
+            random.shuffle(self.tar_lists[key])
+        random.shuffle(self.tar_keys)
+        print("shuffle everything done!")
+    def download_tar(self, prefetch=True, wid=None):
+        i = 0
+        file_stream = None
+        tar_file = None
+        download = f'prefetch {wid}' if prefetch else 'just download'
+        while True:
+            if i % self.max_retry_n == 0:  # retry a different tar file
+                tar_file = self._get_next_key()  # get the next tar file key
+            file_stream = None
+            try:
+                file_stream = io.BytesIO()
+                self._s3().download_fileobj(self.buckets[tar_file], tar_file, file_stream)  # hard-coded
+                file_stream.seek(0)
+                tar = tarfile.open(fileobj=file_stream, mode='r')
+                # Store the file_stream reference so it can be closed later
+                tar._file_stream = file_stream
+                xprint(f'[INFO] {download} tar file: {tar_file}')
+                return tar, tar_file
+            except Exception as e:
+                xprint(f'[ERROR] {download} tar file {tar_file} failed: {e}')
+                i += 1
+                if file_stream:
+                    file_stream.close()
+                    file_stream = None
+                time.sleep(min(i * 0.1, 5))  # Exponential backoff with cap
+    def _get_next_key(self):
+        with self.tar_keys_lock:
+            if not self.tar_keys or len(self.tar_keys) == 0:
+                xprint(f'[WARN] all dataset exhausted... this should not happen usually')
+                self.tar_keys.extend(list(self.reset_tar_keys))  # reset
+                random.shuffle(self.tar_keys)
+            return self.tar_keys.pop(0)  # remove and return the first key
+    def _start_prefetch(self, wid):
+        """Start prefetching the next tar file for the worker"""
+        # Create executor per worker process if it doesn't exist
+        if wid not in self.worker_executors:
+            self.worker_executors[wid] = ThreadPoolExecutor(max_workers=1)
+        future = self.worker_executors[wid].submit(self.download_tar, prefetch=True, wid=wid)  # download tar file in a separate thread
+        self.worker_caches[wid]['prefetch'] = future
+    def _close_tar(self, tar):
+        # Properly close both tar and underlying file stream
+        if hasattr(tar, '_file_stream') and tar._file_stream:
+            tar._file_stream.close()
+            tar._file_stream = None
+        tar.close()
+        del tar
+        gc.collect()
+    def __getitem__(self, idx):
+        try:
+            wid = get_worker_info().id
+        except Exception as e:
+            wid = -1
+        # ─── first time this worker is used ─── #
+        if wid not in self.worker_caches:
+            tar, key = self.download_tar(prefetch=False)  # download tar file
+            with self.worker_caches_lock:
+                self.worker_caches[wid] = dict(
+                    active=dict(tar=tar, key=key, cnt=0, inner_idx=0),  # active cache
+                )
+                self._start_prefetch(wid)  # start prefetching the next tar file
+        cache = self.worker_caches[wid]
+        active = cache['active']
+        tar = active['tar']
+        key = active['key']
+        cnt = active['cnt']
+        inner_idx = active['inner_idx']
+        # handle image bucketting
+        if self.use_image_bucket:
+            if inner_idx % self.batch_size == 0:
+                # sample based on local tar file statistics in case some dataset only has one image bucket
+                tar_buckets = self.tar_image_buckets[key]
+                target_image_bucket = random.choices(
+                    list(tar_buckets.keys()), weights=list(tar_buckets.values()), k=1)[0]
+                self.worker_caches[wid]['target_image_bucket'] = target_image_bucket
+            # scan the list to find the nearest target image bucket
+            target_image_bucket, t_cnt = self.worker_caches[wid]['target_image_bucket'], cnt
+            while self.all_lines[self.tar_lists[key][t_cnt]]['image_bucket'] != target_image_bucket:
+                t_cnt += 1
+                if t_cnt >= len(self.tar_lists[key]): t_cnt = 0
+            # sawp the image location
+            if cnt != t_cnt:
+                self.tar_lists[key][cnt], self.tar_lists[key][t_cnt] = self.tar_lists[key][t_cnt], self.tar_lists[key][cnt]
+        img_id = self.tar_lists[key][cnt]
+        image_item = self.all_lines[img_id]
+        sample = {key: image_item[key] for key in image_item}
+        image, fps, frame_inds = self._read_image(tar, image_item['file'], image_item['image_bucket'])
+        sample.update(image=image, fps=fps, local_idx=img_id, inner_idx=inner_idx)
+        if self.edit_mode:
+            image, fps, _ = self._read_image(tar, image_item['edited_file'], image_item['image_bucket'])
+            sample.update(edited_image=image, fps=fps, edit_instruction=image_item['edit_instruction'])
+        if "camera_file" in image_item: # dl3dv data
+            sample["condition"] = get_camera_condition(tar, image_item["camera_file"], width=image.shape[3], height=image.shape[2], factor=self.multiple, frame_inds=frame_inds)
+        if "force_caption" in image_item: # force dataset
+            if "wind_speed" in image_item: # wind force
+                sample["condition"] = get_wind_condition(image_item["wind_speed"], image_item["wind_angle"], min_force=self.min_wind_force, max_force=self.max_wind_force, num_frames=image.shape[1], width=image.shape[3], height=image.shape[2])
+            elif "force" in image_item: # point-wise
+                sample["condition"] = get_point_condition(image_item["force"], image_item["angle"], image_item["coordx"], image_item["coordy"], min_force=self.min_wind_force, max_force=self.max_wind_force, num_frames=image.shape[1], width=image.shape[3], height=image.shape[2])
+        # update cnt
+        cnt, inner_idx = cnt + 1, inner_idx + 1
+        if (cnt == len(self.tar_lists[key])) or (cnt == self.max_read):
+            # -- active tar finished, switch to prefetched tar -- #
+            self._close_tar(tar)  # close the current tar file
+            try:
+                # Wait for prefetch with timeout
+                new_tar, new_key = cache['prefetch'].result()  # 5 minute timeout
+            except Exception as e:
+                xprint(f'[WARN] Prefetch failed, downloading new tar synchronously: {e}')
+                new_tar, new_key = self.download_tar(prefetch=False)
+            cache['active'] = dict(tar=new_tar, key=new_key, cnt=0, inner_idx=inner_idx)  # update active cache
+            # shuffle the image list
+            random.shuffle(self.tar_lists[key])  # shuffle the list
+            with self.tar_keys_lock:
+                self.tar_keys.append(key)  # return the key to the list so other workers can use it
+            self._start_prefetch(wid)  # start prefetching the next tar file
+        else:
+            cache['active']['cnt'] = cnt
+        # always update inner_idx (IMPORTANT)
+        cache['active']['inner_idx'] = inner_idx
+        return sample
+class OnlineImageCaptionDataset(OnlineImageTarDataset):
+    def __getitem__(self, idx):
+        sample = super().__getitem__(idx)
+        captions, caption_op = sample['caption']
+        if caption_op == 'none':
+            sample['caption'] = captions[0] if isinstance(captions, list) else captions
+        elif ':' in caption_op:
+            sample['caption'] = random.choices(captions, weights=[float(a) for a in caption_op.split(':')])[0]
+        else:
+            raise NotImplementedError(f"Unknown caption operation: {caption_op}")
+        return sample
+    def collate_fn(self, batch):
+        batch = super().collate_fn(batch)
+        image = batch['image']
+        caption = batch['caption']
+        if self.edit_mode:
+            image = torch.cat([image, batch['edited_image']], dim=0)
+            caption.extend(batch['edit_instruction'])
+        meta = {key: batch[key] for key in batch if key not in
+                ['image', 'caption', 'edited_image', 'edit_instruction']}
+        return image, caption, meta
+# ==== Dummy Dataset Implementation for Open Source Release ====
+class DummyImageCaptionDataset(Dataset):
+    """
+    Dummy dataset that generates synthetic image-caption pairs for training/testing.
+    Supports mixed aspect ratios and batch-wise aspect ratio consistency.
+    """
+    def __init__(
+        self,
+        num_samples: int = 10000,
+        image_size: int = 256,
+        temporal_size: Optional[str] = None,
+        use_image_bucket: bool = False,
+        batch_size: Optional[int] = None,
+        multiple: int = 8,
+        no_flip: bool = False,
+        edit: bool = False
+    ):
+        """
+        Args:
+            num_samples: Number of samples in the dataset
+            image_size: Base image size for generation
+            temporal_size: Video size specification (e.g., "16:8" for frames:fps)
+            use_image_bucket: Whether to use aspect ratio bucketing
+            batch_size: Batch size for bucketing (required if use_image_bucket=True)
+            multiple: Multiple for dimension rounding
+            no_flip: Whether to disable horizontal flipping
+            edit: Whether this is an editing dataset
+        """
+        self.num_samples = num_samples
+        self.image_size = image_size
+        self.temporal_size = temporal_size
+        self.use_image_bucket = use_image_bucket
+        self.batch_size = batch_size
+        self.multiple = multiple
+        self.no_flip = no_flip
+        self.edit_mode = edit
+        # Parse video parameters
+        self.is_video = temporal_size is not None
+        if self.is_video:
+            frames, fps = map(int, temporal_size.split(':'))
+            self.num_frames = frames
+            self.fps = fps
+        else:
+            self.num_frames = 1
+            self.fps = None
+        # Aspect ratios for mixed aspect ratio training
+        self.aspect_ratios = [
+            "1:1", "2:3", "3:2", "16:9", "9:16",
+            "4:5", "5:4", "21:9", "9:21"
+        ] if use_image_bucket else ["1:1"]
+        # Generate image buckets for aspect ratios
+        self.image_buckets = {}
+        for i, ar in enumerate(self.aspect_ratios):
+            h, w = aspect_ratio_to_image_size(image_size, ar, multiple)
+            self.image_buckets[ar] = (h, w, ar)
+        # Sample captions for dummy data
+        self.sample_captions = [
+            "A beautiful landscape with mountains and trees",
+            "A cute cat sitting on a wooden table",
+            "A modern city skyline at sunset",
+            "A vintage car parked on a street",
+            "A delicious meal on a white plate",
+            "A person walking in a park",
+            "A colorful flower garden in bloom",
+            "A cozy living room with furniture",
+            "A stormy ocean with large waves",
+            "A peaceful forest path in autumn",
+            "A group of friends laughing together",
+            "A majestic eagle flying in the sky",
+            "A busy marketplace with vendors",
+            "A snow-covered mountain peak",
+            "A child playing with toys",
+            "A romantic candlelit dinner",
+            "A train traveling through countryside",
+            "A lighthouse on a rocky coast",
+            "A field of sunflowers under blue sky",
+            "A family having a picnic outdoors"
+        ]
+        # Create transform pipeline
+        def center_crop_resize(img, ratio="1:1", target_size: int = 256, multiple: int = 8):
+            """
+            1. Center crop `img` to the largest window with aspect ratio = ratio.
+            2. Resize so  HxW ≈ target_size²  (each side a multiple of `multiple`).
+            Args
+            ----
+            img         : PIL Image or torch tensor (CHW/HWC)
+            ratio       : "3:2", (3,2), "1:1", etc.
+            target_size : reference side length (area = target_size²)
+            multiple    : force each output side to be a multiple of this number
+            """
+            # --- parse ratio ----------------------------------------------------------
+            if isinstance(ratio, str):
+                rw, rh = map(int, ratio.split(':'))
+            else:                                 # already a tuple/list
+                rw, rh = ratio
+            R = rw / rh                           # width / height
+            # --- crop to that aspect ratio -------------------------------------------
+            w, h = img.size if hasattr(img, "size") else (img.shape[-1], img.shape[-2])
+            if w / h > R:                         # image too wide → trim width
+                crop_h, crop_w = h, int(round(h * R))
+            else:                                 # image too tall → trim height
+                crop_w, crop_h = w, int(round(w / R))
+            img = transforms.functional.center_crop(img, (crop_h, crop_w))
+            # --- compute output dimensions -------------------------------------------
+            area  = target_size ** 2
+            out_h = _nearest_multiple(math.sqrt(area / R), multiple)
+            out_w = _nearest_multiple(math.sqrt(area * R), multiple)
+            # --- resize & return ------------------------------------------------------
+            return transforms.functional.resize(img, (out_h, out_w), antialias=True)
+        self.transforms = {}
+        self.size_bucket_maps = {}
+        self.bucket_size_maps = {}
+        for bucket in self.image_buckets:
+            trans = [transforms.Lambda(lambda img, r=bucket: center_crop_resize(img, ratio=r, target_size=image_size, multiple=multiple))]
+            if not no_flip:
+                trans.append(transforms.RandomHorizontalFlip())
+            trans.extend([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
+            self.transforms[bucket] = transforms.Compose(trans)
+            w, h = map(int, bucket.split(':'))
+            out_h, out_w = aspect_ratio_to_image_size(image_size, w / h, multiple=multiple)
+            self.size_bucket_maps[(out_h, out_w)] = bucket
+            self.bucket_size_maps[bucket] = (out_h, out_w)
+        self.transform = self.transforms['1:1']  # default transform
+    def __len__(self) -> int:
+        return self.num_samples
+    def __getitem__(self, idx: int) -> dict:
+        """Get a single sample from the dataset."""
+        # Choose aspect ratio
+        if self.use_image_bucket:
+            bucket_name = random.choice(list(self.image_buckets.keys()))
+            h, w, aspect_ratio = self.image_buckets[bucket_name]
+        else:
+            h, w, aspect_ratio = self.image_size, self.image_size, "1:1"
+            bucket_name = aspect_ratio
+        # Generate dummy image
+        if self.is_video:
+            # Generate video tensor (T, C, H, W)
+            image = torch.randn(self.num_frames, 3, h, w)
+            # Normalize to [-1, 1] range
+            image = torch.tanh(image)
+        else:
+            # Generate RGB image
+            image = Image.new('RGB', (w, h), color=(
+                random.randint(50, 200),
+                random.randint(50, 200),
+                random.randint(50, 200)
+            ))
+            # Add some random patterns for variety
+            if random.random() > 0.5:
+                # Add gradient
+                pixels = []
+                for y in range(h):
+                    for x in range(w):
+                        r = int(255 * x / w)
+                        g = int(255 * y / h)
+                        b = int(255 * (x + y) / (w + h))
+                        pixels.append((r, g, b))
+                image.putdata(pixels)
+            image = self.transform(image)
+        # Generate caption
+        caption = random.choice(self.sample_captions)
+        # Add some variation to captions
+        if random.random() > 0.7:
+            adjectives = ["beautiful", "stunning", "amazing", "incredible", "magnificent"]
+            caption = f"{random.choice(adjectives)} {caption.lower()}"
+        sample = {
+            'image': image,
+            'caption': caption,
+            'image_bucket': bucket_name,
+            'aspect_ratio': aspect_ratio,
+            'idx': idx
+        }
+        # Add video-specific metadata
+        if self.is_video:
+            sample.update({
+                'num_frames': self.num_frames,
+                'fps': self.fps,
+                'temporal_size': self.temporal_size
+            })
+        # Add editing data if needed
+        if self.edit_mode:
+            # Generate slightly modified image for editing tasks
+            edited_image = image + torch.randn_like(image) * 0.1
+            edited_image = torch.clamp(edited_image, -1, 1)
+            sample.update({
+                'edited_image': edited_image,
+                'edit_instruction': f"Edit this image to make it more {random.choice(['colorful', 'bright', 'artistic', 'realistic'])}"
+            })
+        return sample
+    def collate_fn(self, batch: list) -> tuple:
+        """Collate function for batching samples."""
+        # Group by aspect ratio if using image buckets
+        if self.use_image_bucket:
+            # Sort batch by image bucket for consistency
+            batch = sorted(batch, key=lambda x: x['image_bucket'])
+        # Standard collation
+        collated = {}
+        images = torch.stack([item['image'] for item in batch], dim=0)
+        captions = [item['caption'] for item in batch]
+        # Collect metadata
+        for key in ['image_bucket', 'aspect_ratio', 'idx']:
+            if key in batch[0]:
+                collated[key] = [item[key] for item in batch]
+        # Handle video metadata
+        if self.is_video:
+            for key in ['num_frames', 'fps', 'temporal_size']:
+                if key in batch[0]:
+                    collated[key] = [item[key] for item in batch]
+        # Handle editing data
+        if self.edit_mode and 'edited_image' in batch[0]:
+            edited_images = torch.stack([item['edited_image'] for item in batch], dim=0)
+            collated['edited_image'] = edited_images
+            collated['edit_instruction'] = [item['edit_instruction'] for item in batch]
+        return images, captions, collated
+    def get_batch_modes(self, x):
+        x_aspect   = self.size_bucket_maps.get(x.size()[-2:], "1:1")
+        video_mode = self.temporal_size is not None
+        return x_aspect, video_mode
+class DummyDataLoaderWrapper:
+    """
+    Wrapper that mimics the DataLoaderWrapper functionality.
+    Provides infinite iteration over the dataset.
+    """
+    def __init__(self, dataset, batch_size=1, num_workers=0, **kwargs):
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.dataloader = DataLoader(
+            dataset,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            collate_fn=dataset.collate_fn,
+            shuffle=True,
+            drop_last=True,
+            **kwargs
+        )
+        self.iterator = None
+        self.secondary_loader = None
+    def __iter__(self):
+        """Infinite iteration over the dataset."""
+        while True:
+            if self.iterator is None:
+                self.iterator = iter(self.dataloader)
+            try:
+                yield next(self.iterator)
+            except StopIteration:
+                self.iterator = iter(self.dataloader)
+                yield next(self.iterator)
+    def __len__(self):
+        return len(self.dataloader)
+def create_dummy_dataloader(
+    dataset_name: str,
+    img_size: int,
+    vid_size: Optional[str] = None,
+    batch_size: int = 16,
+    use_mixed_aspect: bool = False,
+    multiple: int = 8,
+    num_samples: int = 10000,
+    infinite: bool = False
+) -> Union[DataLoader, DummyDataLoaderWrapper]:
+    """
+    Create a dummy dataloader that mimics the original functionality.
+    Args:
+        dataset_name: Name of the dataset (used for deterministic seeding)
+        img_size: Base image size
+        vid_size: Video specification (e.g., "16:8")
+        batch_size: Batch size
+        use_mixed_aspect: Whether to use mixed aspect ratio training
+        multiple: Multiple for dimension rounding
+        num_samples: Number of samples in the dataset
+        infinite: Whether to create infinite dataloader
+    Returns:
+        DataLoader or DummyDataLoaderWrapper
+    """
+    # Set seed based on dataset name for reproducibility
+    seed = hash(dataset_name) % (2**32 - 1)
+    random.seed(seed)
+    np.random.seed(seed)
+    # Create dataset
+    dataset = DummyImageCaptionDataset(
+        num_samples=num_samples,
+        image_size=img_size,
+        temporal_size=vid_size,
+        use_image_bucket=use_mixed_aspect,
+        batch_size=batch_size,
+        multiple=multiple,
+        edit='edit' in dataset_name.lower()
+    )
+    # Set dataset attributes expected by training code
+    dataset.total_num_samples = num_samples
+    dataset.num_samples_per_rank = num_samples
+    # Create dataloader
+    if infinite:
+        return DummyDataLoaderWrapper(
+            dataset,
+            batch_size=batch_size,
+            num_workers=2,
+            pin_memory=True,
+            drop_last=True,
+            persistent_workers=True
+        )
+    else:
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            num_workers=2,
+            pin_memory=True,
+            drop_last=True,
+            shuffle=True,
+            collate_fn=dataset.collate_fn,
+            persistent_workers=True
+        )