# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# utility functions about images (loading/converting...)
# --------------------------------------------------------
import os
import json
import torch
import numpy as np
import PIL.Image
from tqdm import tqdm
from PIL.ImageOps import exif_transpose
import torchvision.transforms as tvf
# must be set before cv2 is imported so that OpenCV enables its OpenEXR reader
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
import cv2  # noqa

try:
    from pillow_heif import register_heif_opener  # noqa
    register_heif_opener()
    heif_support_enabled = True
except ImportError:
    heif_support_enabled = False

from .geometry import depthmap_to_camera_coordinates

# PIL image / uint8 array -> float tensor, then (x - 0.5) / 0.5 on each of the 3 channels
ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])


def imread_cv2(path, options=cv2.IMREAD_COLOR):
    """ Open an image or a depthmap with opencv-python.

    :param path: file path; paths ending in '.exr' / 'EXR' are read with cv2.IMREAD_ANYDEPTH.
    :param options: cv2.imread flags used for every other file.
    :return: the loaded array; 3-dimensional results are converted from BGR to RGB.
    :raises IOError: if cv2.imread returns None.
    """
    if path.endswith(('.exr', 'EXR')):
        options = cv2.IMREAD_ANYDEPTH
    img = cv2.imread(path, options)
    if img is None:
        raise IOError(f'Could not load image={path} with {options=}')
    if img.ndim == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def rgb(ftensor, true_shape=None):
    """ Convert an image (torch tensor, numpy array, or list of those) to floats in [0, 1].

    Channel-first inputs (3,H,W) or (B,3,H,W) are transposed to channel-last.
    uint8 data is divided by 255; any other dtype is mapped with x*0.5+0.5
    (the inverse of the normalization done by ImgNorm), then clipped to [0, 1].

    :param ftensor: tensor / array / list of them.
    :param true_shape: optional (H, W); the result is sliced to [:H, :W] on its first two axes.
    :return: numpy array (or list of numpy arrays).
    """
    if isinstance(ftensor, list):
        return [rgb(x, true_shape=true_shape) for x in ftensor]
    if isinstance(ftensor, torch.Tensor):
        ftensor = ftensor.detach().cpu().numpy()  # H,W,3
    if ftensor.ndim == 3 and ftensor.shape[0] == 3:
        ftensor = ftensor.transpose(1, 2, 0)
    elif ftensor.ndim == 4 and ftensor.shape[1] == 3:
        ftensor = ftensor.transpose(0, 2, 3, 1)
    if true_shape is not None:
        H, W = true_shape
        ftensor = ftensor[:H, :W]
    if ftensor.dtype == np.uint8:
        img = np.float32(ftensor) / 255
    else:
        img = (ftensor * 0.5) + 0.5
    return img.clip(min=0, max=1)


def _resize_pil_image(img, long_edge_size):
    """ Resize a PIL image so that its longest edge becomes `long_edge_size`.

    LANCZOS is used when shrinking, BICUBIC otherwise; the aspect ratio is
    kept up to rounding of each side.
    """
    S = max(img.size)
    if S > long_edge_size:
        interp = PIL.Image.LANCZOS
    else:
        interp = PIL.Image.BICUBIC
    new_size = tuple(int(round(x*long_edge_size/S)) for x in img.size)
    return img.resize(new_size, interp)


def _trailing_number(name, postfix=None):
    """ Return the number formed by the digits that end the stem of `name`.

    The stem is `name` without `postfix` (when given and present), otherwise
    without its file extension. Names whose stem does not end with a digit
    return +inf so that they sort after all numbered names.
    """
    if postfix is not None and name.endswith(postfix):
        stem = name[:len(name) - len(postfix)]
    else:
        stem = os.path.splitext(name)[0]
    digits = stem[len(stem.rstrip('0123456789')):]
    return float(digits) if digits else float('inf')


def load_images(folder_or_list, size, square_ok=False,
                verbose=1, img_num=0, img_freq=0,
                postfix=None, start_idx=0):
    """ open and convert all images in a list or folder to proper input format for DUSt3R

    :param folder_or_list: a folder path (entries whose name contains "depth" are ignored,
        and only names ending with `postfix` are kept when it is given) or a list of paths.
    :param size: 224 -> resize the short side to 224 and center-crop a square;
        otherwise resize the long side to `size` and center-crop to multiples of 16.
    :param square_ok: when False and size != 224, square images are cropped to 4:3.
    :param verbose: >0 prints progress, >1 also prints one line per image.
    :param img_num: if >0, keep at most this many entries (applied last).
    :param img_freq: if >0, keep one entry out of `img_freq`.
    :param postfix: optional filename suffix (e.g. '_rgb.png') stripped before reading
        the frame number.
    :param start_idx: if >0, skip this many entries after sorting.
    :return: list of dicts with keys img, true_shape, idx, instance, label.
    :raises ValueError: if `folder_or_list` is neither a str nor a list.
    """
    if isinstance(folder_or_list, str):
        if verbose > 0:
            print(f'>> Loading images from {folder_or_list}')
        img_names = [name for name in os.listdir(folder_or_list) if "depth" not in name]
        if postfix is not None:
            img_names = [name for name in img_names if name.endswith(postfix)]
        root, folder_content = folder_or_list, img_names

    elif isinstance(folder_or_list, list):
        if verbose > 0:
            print(f'>> Loading a list of {len(folder_or_list)} images')
        root, folder_content = '', folder_or_list

    else:
        raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})')

    # sort images by the number that ends their name (ties broken by name);
    # the number is extracted per file so mixed extensions and names whose
    # only non-digit character is the first one are handled correctly
    folder_content = sorted(folder_content,
                            key=lambda name: (_trailing_number(name, postfix), name))

    # the selection below runs on the full sorted listing, before the
    # extension filter, so unsupported files still count towards the indices
    if start_idx > 0:
        folder_content = folder_content[start_idx:]
    if img_freq > 0:
        folder_content = folder_content[::img_freq]
    if img_num > 0:
        folder_content = folder_content[:img_num]

    supported_images_extensions = ['.jpg', '.jpeg', '.png']
    if heif_support_enabled:
        supported_images_extensions += ['.heic', '.heif']
    supported_images_extensions = tuple(supported_images_extensions)

    imgs = []
    if verbose > 0:
        folder_content = tqdm(folder_content, desc='Loading images')
    for path in folder_content:
        if not path.lower().endswith(supported_images_extensions):
            continue
        img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert('RGB')
        W1, H1 = img.size
        if size == 224:
            # resize short side to 224 (then crop)
            img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1)))
        else:
            # resize long side to `size`
            img = _resize_pil_image(img, size)
        W, H = img.size
        cx, cy = W//2, H//2
        if size == 224:
            half = min(cx, cy)
            img = img.crop((cx-half, cy-half, cx+half, cy+half))
        else:
            # half extents are multiples of 8, so the crop sides are multiples of 16
            halfw, halfh = ((2*cx)//16)*8, ((2*cy)//16)*8
            if not square_ok and W == H:
                halfh = 3*halfw//4
            img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh))

        W2, H2 = img.size
        if verbose > 1:
            print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}')

        imgs.append(dict(img=ImgNorm(img)[None],
                         true_shape=np.int32([img.size[::-1]]),
                         idx=len(imgs),
                         instance=str(len(imgs)),
                         label=path))

    assert imgs, 'no images foud at '+ root
    if verbose > 0:
        print(f' ({len(imgs)} images loaded)')
    return imgs


def load_single_image(frame_bgr: np.ndarray,
                      size: int = 224,
                      square_ok: bool = False,
                      device: str = 'cpu') -> dict:
    """
    Process a single frame given as a NumPy array, following the same logic as the original load_images function.

    :param frame_bgr: Input NumPy image array (H, W, 3), must be in OpenCV's default BGR order.
    :param size: Target size, typically 224.
    :param square_ok: Whether to allow square output (when size is not 224).
    :param device: Device to place the output Tensor ('cpu' or 'cuda').
    :return: A standard dictionary containing the processed image information.
    """
    img_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    img = PIL.Image.fromarray(img_rgb)
    img = exif_transpose(img)

    W1, H1 = img.size
    if size == 224:
        # short side -> size, long side scaled accordingly
        if W1 < H1:
            new_w, new_h = size, round(size * H1 / W1)
        else:
            new_w, new_h = round(size * W1 / H1), size
    else:
        # long side -> size, short side scaled accordingly
        if W1 < H1:
            new_w, new_h = round(size * W1 / H1), size
        else:
            new_w, new_h = size, round(size * H1 / W1)
    resized_img = img.resize((new_w, new_h), PIL.Image.LANCZOS)

    W, H = resized_img.size
    cx, cy = W // 2, H // 2
    if size == 224:
        half = size // 2
        cropped_img = resized_img.crop((cx - half, cy - half, cx + half, cy + half))
    else:
        # same crop window as load_images: half extents are multiples of 8
        # spanning (almost) the whole image, not a quarter of it
        halfw = ((2 * cx) // 16) * 8
        halfh = ((2 * cy) // 16) * 8
        if not square_ok and W == H:
            halfh = 3 * halfw // 4
        cropped_img = resized_img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))

    W2, H2 = cropped_img.size
    img_tensor = ImgNorm(cropped_img)[None].to(device)

    # NOTE(review): true_shape here is a torch tensor of shape (2,), whereas
    # load_images stores a numpy int32 array of shape (1, 2) - confirm callers.
    processed_dict = dict(
        img=img_tensor,
        true_shape=torch.tensor([H2, W2], dtype=torch.int32).to(device),
        idx=0,
        instance='0',
        label='single_frame'
    )
    return processed_dict


def crop_and_resize(image, depthmap, intrinsics, long_size, rng=None, info=None, use_crop=False):
    """ Optionally crop around the principal point, then rescale to a given long side.

    This function:
        1. (only if `use_crop`) crops the image so that its principal point
           really lies at the center of the crop
        2. rescales so that the longer image side equals `long_size`; the
           landscape/portrait orientation of the target follows the image

    :param image: PIL image or array (arrays are converted with PIL.Image.fromarray).
    :param depthmap: depth map passed through to the cropping helpers.
    :param intrinsics: camera matrix; the principal point is read from intrinsics[:2, 2].
    :param long_size: target length of the longer side.
    :param rng: unused.
    :param info: only used in the assertion messages.
    :param use_crop: enable step 1.
    :return: (image, depthmap, intrinsics) as returned by the cropping helpers.
    """
    import slam3r.datasets.utils.cropping as cropping
    if not isinstance(image, PIL.Image.Image):
        image = PIL.Image.fromarray(image)

    W, H = image.size
    cx, cy = intrinsics[:2, 2].round().astype(int)

    if use_crop:
        # cropping centered on the principal point
        min_margin_x = min(cx, W-cx)
        min_margin_y = min(cy, H-cy)
        assert min_margin_x > W/5, f'Bad principal point in view={info}'
        assert min_margin_y > H/5, f'Bad principal point in view={info}'
        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        image, depthmap, intrinsics = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox)

    # new size: uniform scale so that max(W, H) becomes long_size
    W, H = image.size
    scale = long_size / max(W, H)

    target_resolution = np.array([W, H]) * scale
    image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution)

    return image, depthmap, intrinsics


def load_scannetpp_images_pts3dcam(folder_or_list, size, square_ok=False,
                                   verbose=True, img_num=0, img_freq=0, start_idx=1000):
    """ open and convert all images in a list or folder to proper input format for DUSt3R

    In addition to the image, each entry carries the 3D points in camera
    coordinates computed from the matching depth map and intrinsics.

    Intrinsics are read from 'pose_intrinsic_imu.json' located in the parent
    directory of the image folder (with a list input the root is '' so the
    file is looked up relative to the current directory).

    :param folder_or_list: folder path (listing is sorted by name) or list of paths.
    :param size: target length of the longer image side (see crop_and_resize).
    :param square_ok: unused.
    :param verbose: print progress information.
    :param img_num: if >0, keep at most this many entries.
    :param img_freq: if >0, keep entries `start_idx`, `start_idx + img_freq`, ...
    :param start_idx: first entry kept when `img_freq` > 0 (ignored otherwise).
    :return: list of dicts with keys img, true_shape, idx, instance, pts3d_cam, valid_mask.
    :raises ValueError: if `folder_or_list` is neither a str nor a list.
    """
    if isinstance(folder_or_list, str):
        if verbose:
            print(f'>> Loading images from {folder_or_list}')
        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))

    elif isinstance(folder_or_list, list):
        if verbose:
            print(f'>> Loading a list of {len(folder_or_list)} images')
        root, folder_content = '', folder_or_list

    else:
        raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})')

    if img_freq > 0:
        folder_content = folder_content[start_idx::img_freq]
    if img_num > 0:
        folder_content = folder_content[:img_num]

    supported_images_extensions = ['.jpg', '.jpeg', '.png']
    if heif_support_enabled:
        supported_images_extensions += ['.heic', '.heif']
    supported_images_extensions = tuple(supported_images_extensions)

    imgs = []
    intrinsic_path = os.path.join(os.path.dirname(root), 'pose_intrinsic_imu.json')
    with open(intrinsic_path, 'r') as f:
        info = json.load(f)

    for path in folder_content:
        if not path.lower().endswith(supported_images_extensions):
            continue
        img_path = os.path.join(root, path)
        img = exif_transpose(PIL.Image.open(img_path)).convert('RGB')
        W1, H1 = img.size
        # depth file: same path with '.jpg' -> '.png' and every 'rgb' -> 'depth'
        depth_path = img_path.replace('.jpg', '.png').replace('rgb','depth')
        depthmap = imread_cv2(depth_path, cv2.IMREAD_UNCHANGED)
        # presumably millimetres -> metres; confirm against the dataset
        depthmap = depthmap.astype(np.float32) / 1000.
        # img and depth have different conventions about shape:
        # PIL size is (W, H) and cv2.resize takes (W, H), the array is (H, W)
        depthmap = cv2.resize(depthmap, (W1,H1), interpolation=cv2.INTER_CUBIC)

        # key in the json is the file name without its extension
        img_id = os.path.splitext(os.path.basename(img_path))[0]
        intrinsics = np.array(info[img_id]['intrinsic'])
        img, depthmap, intrinsics = crop_and_resize(img, depthmap, intrinsics, size)
        pts3d_cam, mask = depthmap_to_camera_coordinates(depthmap, intrinsics)
        pts3d_cam = pts3d_cam * mask[..., None]
        valid_mask = np.isfinite(pts3d_cam).all(axis=-1)

        W2, H2 = img.size
        if verbose:
            print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}')

        imgs.append(dict(img=ImgNorm(img)[None],
                         true_shape=np.int32([img.size[::-1]]),
                         idx=len(imgs),
                         instance=str(len(imgs)),
                         pts3d_cam=pts3d_cam[None],
                         valid_mask=valid_mask[None]
                         ))

    assert imgs, 'no images foud at '+root
    if verbose:
        print(f' (Found {len(imgs)} images)')
    return imgs