Spaces:

hzxie
/

city-dreamer

Sleeping

App Files Files Community

hzxie committed on Mar 3, 2024

Commit

79df973

verified ·

1 Parent(s): 085810c

feat: citydreamer inference (bugs to be fixed).

Browse files

Files changed (20) hide show

app.py +58 -38
citydreamer/__init__.py +0 -0
citydreamer/extensions/__init__.py +8 -0
citydreamer/extensions/extrude_tensor/__init__.py +40 -0
citydreamer/extensions/extrude_tensor/bindings.cpp +39 -0
citydreamer/extensions/extrude_tensor/extrude_tensor_ext.cu +67 -0
citydreamer/extensions/extrude_tensor/setup.py +26 -0
citydreamer/extensions/extrude_tensor/test.py +124 -0
citydreamer/extensions/grid_encoder/__init__.py +193 -0
citydreamer/extensions/grid_encoder/bindings.cpp +40 -0
citydreamer/extensions/grid_encoder/grid_encoder_ext.cu +605 -0
citydreamer/extensions/grid_encoder/setup.py +39 -0
citydreamer/extensions/voxlib/__init__.py +5 -0
citydreamer/extensions/voxlib/ray_voxel_intersection.cu +351 -0
citydreamer/extensions/voxlib/setup.py +25 -0
citydreamer/extensions/voxlib/voxlib.cpp +21 -0
citydreamer/extensions/voxlib/voxlib_common.h +83 -0
citydreamer/inference.py +537 -0
citydreamer/model.py +1264 -0
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -4,80 +4,90 @@
 # @Author: Haozhe Xie
 # @Date:   2024-03-02 16:30:00
 # @Last Modified by: Haozhe Xie
-# @Last Modified at: 2024-03-03 10:39:25
 # @Email:  [email protected]
 import logging
 import os
-import torch
-import gradio as gr
-import subprocess
-import urllib.request
 import ssl
 import sys
 # Fix: ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed
 ssl._create_default_https_context = ssl._create_unverified_context
-sys.path.append(os.path.join(os.path.dirname(__file__), "citydreamer"))
 # Import CityDreamer modules
-# import citydreamer.model
-# import citydreamer.inference
 def setup_runtime_env():
-    subprocess.call(["pip", "freeze"])
     ext_dir = os.path.join(os.path.dirname(__file__), "citydreamer", "extensions")
     for e in os.listdir(ext_dir):
-        if not os.path.isdir(e):
             continue
-        subprocess.call(["pip", "install", "."], workdir=os.path.join(ext_dir, e))
-def get_models():
-    if not os.path.exists("CityDreamer-Fgnd.pth"):
-        urllib.request.urlretrieve(
-            "https://huggingface.co/hzxie/city-dreamer/resolve/main/CityDreamer-Fgnd.pth",
-            "CityDreamer-Fgnd.pth",
-        )
-    if not os.path.exists("CityDreamer-Bgnd.pth"):
         urllib.request.urlretrieve(
-            "https://huggingface.co/hzxie/city-dreamer/resolve/main/CityDreamer-Bgnd.pth",
-            "CityDreamer-Bgnd.pth",
         )
-    bgm_ckpt = torch.load("CityDreamer-Bgnd.pth")
-    fgm_ckpt = torch.load("CityDreamer-Fgnd.pth")
-    bgm = citydreamer.model.GanCraftGenerator(bgm_ckpt["cfg"])
-    fgm = citydreamer.model.GanCraftGenerator(fgm_ckpt["cfg"])
     if torch.cuda.is_available():
-        fgm = torch.nn.DataParallel(fgm).cuda().eval()
-        bgm = torch.nn.DataParallel(bgm).cuda().eval()
-    return bgm, fgm
 def get_generated_city(radius, altitude, azimuth):
-    print(radius, altitude, azimuth)
 def main(debug):
     title = "CityDreamer Demo 🏙️"
     with open("README.md", "r") as f:
         markdown = f.read()
-        desc = markdown[markdown.rfind("---") + 3:]
     with open("ARTICLE.md", "r") as f:
         arti = f.read()
     app = gr.Interface(
         get_generated_city,
         [
-            gr.Slider(
-                128, 512, value=320, step=5, label="Camera Radius (m)"
-            ),
-            gr.Slider(
-                256, 512, value=384, step=5, label="Camera Altitude (m)"
-            ),
             gr.Slider(0, 360, value=180, step=5, label="Camera Azimuth (°)"),
         ],
         [gr.Image(type="numpy", label="Generated City")],
@@ -94,9 +104,19 @@ if __name__ == "__main__":
     logging.basicConfig(
         format="[%(levelname)s] %(asctime)s %(message)s", level=logging.INFO
     )
-    logging.info("Compile CUDA extensions...")
     # setup_runtime_env()
     logging.info("Downloading pretrained models...")
-    # fgm, bgm = get_models()
     logging.info("Starting the main application...")
     main(os.getenv("DEBUG") == "1")

 # @Author: Haozhe Xie
 # @Date:   2024-03-02 16:30:00
 # @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-03-03 12:02:23
 # @Email:  [email protected]
+import gradio as gr
 import logging
+import numpy as np
 import os
 import ssl
+import subprocess
 import sys
+import torch
+import urllib.request
+from PIL import Image
 # Fix: ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed
 ssl._create_default_https_context = ssl._create_unverified_context
 # Import CityDreamer modules
+sys.path.append(os.path.join(os.path.dirname(__file__), "citydreamer"))
 def setup_runtime_env():
+    logging.info("CUDA version is %s" % subprocess.check_output(["nvcc", "--version"]))
+    logging.info("GCC version is %s" % subprocess.check_output(["g++", "--version"]))
+    # Compile CUDA extensions
     ext_dir = os.path.join(os.path.dirname(__file__), "citydreamer", "extensions")
     for e in os.listdir(ext_dir):
+        if not os.path.isdir(os.path.join(ext_dir, e)):
             continue
+        subprocess.call(["pip", "install", "."], cwd=os.path.join(ext_dir, e))
+def get_models(file_name):
+    import citydreamer.model
+    if not os.path.exists(file_name):
         urllib.request.urlretrieve(
+            "https://huggingface.co/hzxie/city-dreamer/resolve/main/%s" % file_name,
+            file_name,
         )
+    ckpt = torch.load(file_name)
+    model = citydreamer.model.GanCraftGenerator(ckpt["cfg"])
     if torch.cuda.is_available():
+        model = torch.nn.DataParallel(model).cuda().eval()
+    return model
+def get_city_layout():
+    hf = np.array(Image.open("assets/NYC-HghtFld.png"))
+    seg = np.array(Image.open("assets/NYC-SegMap.png").convert("P"))
+    return hf, seg
 def get_generated_city(radius, altitude, azimuth):
+    # The import must be done after CUDA extension compilation
+    import citydreamer.inference
+    return citydreamer.inference.generate_city(
+        get_generated_city.fgm,
+        get_generated_city.bgm,
+        get_generated_city.hf,
+        get_generated_city.seg,
+        radius,
+        altitude,
+        azimuth,
+    )
 def main(debug):
     title = "CityDreamer Demo 🏙️"
     with open("README.md", "r") as f:
         markdown = f.read()
+        desc = markdown[markdown.rfind("---") + 3 :]
     with open("ARTICLE.md", "r") as f:
         arti = f.read()
     app = gr.Interface(
         get_generated_city,
         [
+            gr.Slider(128, 512, value=320, step=5, label="Camera Radius (m)"),
+            gr.Slider(256, 512, value=384, step=5, label="Camera Altitude (m)"),
             gr.Slider(0, 360, value=180, step=5, label="Camera Azimuth (°)"),
         ],
         [gr.Image(type="numpy", label="Generated City")],
     logging.basicConfig(
         format="[%(levelname)s] %(asctime)s %(message)s", level=logging.INFO
     )
+    logging.info("Compiling CUDA extensions...")
     # setup_runtime_env()
     logging.info("Downloading pretrained models...")
+    fgm = get_models("CityDreamer-Fgnd.pth")
+    bgm = get_models("CityDreamer-Bgnd.pth")
+    get_generated_city.fgm = fgm
+    get_generated_city.bgm = bgm
+    logging.info("Loading New York city layout to RAM...")
+    hf, seg = get_city_layout()
+    get_generated_city.hf = hf
+    get_generated_city.seg = seg
     logging.info("Starting the main application...")
     main(os.getenv("DEBUG") == "1")

citydreamer/__init__.py ADDED Viewed

File without changes

citydreamer/extensions/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: Haozhe Xie
+# @Date:   2023-03-24 20:23:53
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-03-24 20:23:55
+# @Email:  [email protected]

citydreamer/extensions/extrude_tensor/__init__.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: Haozhe Xie
+# @Date:   2023-03-24 20:24:38
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-06-16 09:55:58
+# @Email:  [email protected]
+import torch
+import extrude_tensor_ext
+class TensorExtruder(torch.nn.Module):
+    def __init__(self, max_height=256):
+        super(TensorExtruder, self).__init__()
+        self.max_height = max_height
+    def forward(self, seg_map, height_field):
+        assert torch.max(height_field) < self.max_height, "Max Value %d" % torch.max(
+            height_field
+        )
+        return ExtrudeTensorFunction.apply(seg_map, height_field, self.max_height)
+class ExtrudeTensorFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, seg_map, height_field, max_height):
+        # seg_map.shape: (B, C, H, W)
+        # height_field.shape: (B, C, H, W)
+        return extrude_tensor_ext.forward(seg_map, height_field, max_height)
+    @staticmethod
+    def backward(ctx, grad_volume):
+        # grad_volume.shape: (B, C, H, W, D)
+        # Combine the gradients along the Z-axis.
+        grad_seg_map = torch.sum(grad_volume, dim=4)
+        grad_height_field = grad_seg_map
+        return grad_seg_map, grad_height_field

citydreamer/extensions/extrude_tensor/bindings.cpp ADDED Viewed

	@@ -0,0 +1,39 @@

+/**
+ * @File:   extrude_tensor_ext_cuda.cpp
+ * @Author: Haozhe Xie
+ * @Date:   2023-03-26 11:06:13
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-03-26 16:28:20
+ * @Email:  [email protected]
+ */
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+// Implemented in the CUDA translation unit.
+torch::Tensor extrude_tensor_ext_cuda_forward(torch::Tensor seg_map,
+                                              torch::Tensor height_field,
+                                              int max_height,
+                                              cudaStream_t stream);
+// Python-facing entry point: validates both inputs and launches the CUDA
+// implementation on the current CUDA stream.
+torch::Tensor extrude_tensor_ext_forward(torch::Tensor seg_map,
+                                         torch::Tensor height_field,
+                                         int max_height) {
+  CHECK_INPUT(seg_map);
+  CHECK_INPUT(height_field);
+  // Both maps are addressed with the same per-pixel offsets, so a size
+  // mismatch would read past the end of the smaller tensor.
+  TORCH_CHECK(seg_map.numel() == height_field.numel(),
+              "seg_map and height_field must have the same number of elements");
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  return extrude_tensor_ext_cuda_forward(seg_map, height_field, max_height,
+                                         stream);
+}
+// Expose extrude_tensor_ext_forward to Python as `forward` on the extension
+// module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &extrude_tensor_ext_forward,
+        "Extrude Tensor Ext. Forward (CUDA)");
+}

citydreamer/extensions/extrude_tensor/extrude_tensor_ext.cu ADDED Viewed

	@@ -0,0 +1,67 @@

+/**
+ * @File:   extrude_tensor_ext.cu
+ * @Author: Haozhe Xie
+ * @Date:   2023-03-26 11:06:18
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-05-03 14:55:01
+ * @Email:  [email protected]
+ */
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <torch/extension.h>
+#define CUDA_NUM_THREADS 512
+// Compute the number of threads needed in GPU: the largest power of two that
+// does not exceed n, limited to the range [1, CUDA_NUM_THREADS].
+inline int get_n_threads(int n) {
+  // Integer doubling avoids the float log ratio, which can round an exact
+  // power of two down and is undefined for n <= 0.
+  int n_threads = 1;
+  while (n_threads * 2 <= n && n_threads * 2 <= CUDA_NUM_THREADS) {
+    n_threads *= 2;
+  }
+  return n_threads;
+}
+// One CUDA block per batch item (blockIdx.x); the threads of a block stride
+// over the rows. For every pixel (i, j) the label seg_map[i][j] is written to
+// volume[i][j][0..hf] with hf = height_field[i][j]; the remaining entries of
+// the column are left untouched.
+__global__ void extrude_tensor_ext_cuda_kernel(
+    int height, int width, int depth, const int *__restrict__ seg_map,
+    const int *__restrict__ height_field, int *__restrict__ volume) {
+  int batch_index = blockIdx.x;
+  int index = threadIdx.x;
+  int stride = blockDim.x;
+  // Move the base pointers to this batch item.
+  seg_map += batch_index * height * width;
+  height_field += batch_index * height * width;
+  volume += batch_index * height * width * depth;
+  for (int i = index; i < height; i += stride) {
+    int offset_2d_r = i * width, offset_3d_r = i * width * depth;
+    for (int j = 0; j < width; ++j) {
+      int offset_2d_c = offset_2d_r + j, offset_3d_c = offset_3d_r + j * depth;
+      int seg = seg_map[offset_2d_c];
+      int hf = height_field[offset_2d_c];
+      // Clamp so a height >= depth cannot write past the end of the column.
+      int top = hf < depth ? hf + 1 : depth;
+      for (int k = 0; k < top; ++k) {
+        volume[offset_3d_c + k] = seg;
+      }
+    }
+  }
+}
+// Host launcher: allocates the (B, H, W, max_height) int32 volume and fills it
+// on `stream`. seg_map is read as (B, C, H, W); only sizes 0, 2 and 3 are used.
+torch::Tensor extrude_tensor_ext_cuda_forward(torch::Tensor seg_map,
+                                              torch::Tensor height_field,
+                                              int max_height,
+                                              cudaStream_t stream) {
+  int batch_size = seg_map.size(0);
+  int height = seg_map.size(2);
+  int width = seg_map.size(3);
+  // Allocate on the inputs' device instead of the default CUDA device.
+  torch::Tensor volume =
+      torch::zeros({batch_size, height, width, max_height},
+                   seg_map.options().dtype(torch::kInt32));
+  // Nothing to fill, and a launch with zero blocks would be rejected.
+  if (volume.numel() == 0) {
+    return volume;
+  }
+  // The kernel strides over rows, so let each block use up to
+  // CUDA_NUM_THREADS threads rather than a single one.
+  extrude_tensor_ext_cuda_kernel<<<batch_size, get_n_threads(height), 0,
+                                   stream>>>(
+      height, width, max_height, seg_map.data_ptr<int>(),
+      height_field.data_ptr<int>(), volume.data_ptr<int>());
+  // Launch errors are reported on stdout only; the volume is returned anyway.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("Error in extrude_tensor_ext_cuda_forward: %s\n",
+           cudaGetErrorString(err));
+  }
+  return volume;
+}

citydreamer/extensions/extrude_tensor/setup.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   setup.py
+# @Author: Haozhe Xie
+# @Date:   2023-03-24 20:35:43
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-04-29 10:47:30
+# @Email:  [email protected]
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+# Build the `extrude_tensor_ext` CUDA extension from the pybind11 bindings and
+# the CUDA source that sit next to this file.
+setup(
+    name="extrude_tensor",
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension(
+            "extrude_tensor_ext",
+            [
+                "bindings.cpp",
+                "extrude_tensor_ext.cu",
+            ],
+        ),
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)

citydreamer/extensions/extrude_tensor/test.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   test.py
+# @Author: Haozhe Xie
+# @Date:   2023-03-26 19:23:26
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-04-15 10:47:53
+# @Email:  [email protected]
+# Mayavi off screen rendering
+# Ref: https://github.com/enthought/mayavi/issues/477#issuecomment-477653210
+from xvfbwrapper import Xvfb
+vdisplay = Xvfb(width=1920, height=1080)
+vdisplay.start()
+import logging
+import mayavi.mlab
+import numpy as np
+import os
+import sys
+import torch
+import unittest
+from PIL import Image
+from torch.autograd import gradcheck
+sys.path.append(
+    os.path.abspath(
+        os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)
+    )
+)
+from extensions.extrude_tensor import ExtrudeTensorFunction
+# Disable the warning message for PIL decompression bomb
+# Ref: https://stackoverflow.com/questions/25705773/image-cropping-tool-python
+Image.MAX_IMAGE_PIXELS = None
+class ExtrudeTensorTestCase(unittest.TestCase):
+    @unittest.skip("The CUDA extension is compiled with int types by default.")
+    def test_extrude_tensor_grad(self):
+        # To run this test, make sure that the int types are replaced by double types in CUDA
+        SIZE = 16
+        seg_map = (
+            torch.randint(low=1, high=7, size=(SIZE, SIZE))
+            .double()
+            .unsqueeze(dim=0)
+            .unsqueeze(dim=0)
+        )
+        height_field = (
+            torch.randint(low=0, high=255, size=(SIZE, SIZE))
+            .double()
+            .unsqueeze(dim=0)
+            .unsqueeze(dim=0)
+        )
+        logging.debug("SegMap Size: %s" % (seg_map.size(),))
+        logging.debug("HeightField Size: %s" % (height_field.size(),))
+        seg_map.requires_grad = True
+        height_field.requires_grad = True
+        logging.info(
+            "Gradient Check: %s" % "OK"
+            if gradcheck(
+                ExtrudeTensorFunction.apply, [seg_map.cuda(), height_field.cuda(), 256]
+            )
+            else "Failed"
+        )
+    def test_extrude_tensor_gen(self):
+        MAX_HEIGHT = 256
+        proj_home_dir = os.path.join(
+            os.path.dirname(__file__), os.path.pardir, os.path.pardir
+        )
+        osm_data_dir = os.path.join(proj_home_dir, "data", "osm")
+        osm_name = "US-NewYork"
+        seg_map = Image.open(os.path.join(osm_data_dir, osm_name, "seg.png")).convert(
+            "P"
+        )
+        height_field = Image.open(os.path.join(osm_data_dir, osm_name, "hf.png"))
+        # Crop the maps
+        seg_map = np.array(seg_map)[3840:4096, 3840:4096]
+        height_field = np.array(height_field)[3840:4096, 3840:4096]
+        # Convert to tensors
+        seg_map_tnsr = (
+            torch.from_numpy(seg_map).unsqueeze(dim=0).unsqueeze(dim=0).int().cuda()
+        )
+        height_field_tnsr = (
+            torch.from_numpy(height_field)
+            .unsqueeze(dim=0)
+            .unsqueeze(dim=0)
+            .int()
+            .cuda()
+        )
+        volume = ExtrudeTensorFunction.apply(
+            seg_map_tnsr, height_field_tnsr, MAX_HEIGHT
+        )
+        # 3D Visualization
+        vol = volume.squeeze().cpu().numpy().astype(np.uint8)
+        x, y, z = np.where(vol != 0)
+        n_pts = len(x)
+        colors = np.zeros((n_pts, 4), dtype=np.uint8)
+        # fmt: off
+        colors[vol[x, y, z] == 1] = [96, 0, 0, 255]         # highway      -> red
+        colors[vol[x, y, z] == 2] = [96, 96, 0, 255]        # building     -> yellow
+        colors[vol[x, y, z] == 3] = [0, 96, 0, 255]         # green lands  -> green
+        colors[vol[x, y, z] == 4] = [0, 96, 96, 255]        # construction -> cyan
+        colors[vol[x, y, z] == 5] = [0, 0, 96, 255]         # water        -> blue
+        colors[vol[x, y, z] == 6] = [128, 128, 128, 255]    # ground       -> gray
+        # fmt: on
+        mayavi.mlab.options.offscreen = True
+        mayavi.mlab.figure(size=(1600, 900), bgcolor=(1, 1, 1))
+        pts = mayavi.mlab.points3d(x, y, z, mode="cube", scale_factor=1)
+        pts.glyph.scale_mode = "scale_by_vector"
+        pts.mlab_source.dataset.point_data.scalars = colors
+        mayavi.mlab.savefig(os.path.join(proj_home_dir, "logs", "%s-3d.jpg" % osm_name))
+if __name__ == "__main__":
+    logging.basicConfig(
+        format="[%(levelname)s] %(asctime)s %(message)s",
+        level=logging.INFO,
+    )
+    unittest.main()

citydreamer/extensions/grid_encoder/__init__.py ADDED Viewed

	@@ -0,0 +1,193 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   __init__.py
+# @Author: Jiaxiang Tang (@ashawkey)
+# @Date:   2023-04-15 10:39:28
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-04-15 13:08:46
+# @Email:  [email protected]
+# @Ref: https://github.com/ashawkey/torch-ngp
+import math
+import numpy as np
+import torch
+import grid_encoder_ext
+class GridEncoderFunction(torch.autograd.Function):
+    """Autograd bridge to the ``grid_encoder_ext`` forward/backward CUDA kernels."""
+    @staticmethod
+    def forward(
+        ctx,
+        inputs,
+        embeddings,
+        offsets,
+        per_level_scale,
+        base_resolution,
+        calc_grad_inputs=False,
+        gridtype=0,
+        align_corners=False,
+    ):
+        """Encode ``inputs`` and return a ``[B, L * C]`` tensor.
+
+        Allocates the output buffer (and a ``dy_dx`` buffer, full-size only when
+        ``calc_grad_inputs`` is set), hands them to ``grid_encoder_ext.forward``
+        and stores on ``ctx`` everything ``backward`` reads.
+        """
+        # inputs: [B, D], float in [0, 1]
+        # embeddings: [sO, C], float
+        # offsets: [L + 1], int
+        # RETURN: [B, F], float
+        inputs = inputs.contiguous()
+        # batch size, coord dim
+        B, D = inputs.shape
+        # level
+        L = offsets.shape[0] - 1
+        # embedding dim for each level
+        C = embeddings.shape[1]
+        # resolution multiplier at each level, apply log2 for later CUDA exp2f
+        S = math.log2(per_level_scale)
+        # base resolution
+        H = base_resolution
+        # L first, optimize cache for cuda kernel, but needs an extra permute later
+        outputs = torch.empty(L, B, C, device=inputs.device, dtype=embeddings.dtype)
+        if calc_grad_inputs:
+            dy_dx = torch.empty(
+                B, L * D * C, device=inputs.device, dtype=embeddings.dtype
+            )
+        else:
+            dy_dx = torch.empty(
+                1, device=inputs.device, dtype=embeddings.dtype
+            )  # placeholder... TODO: a better way?
+        # The extension writes into `outputs` (and `dy_dx`) in place.
+        grid_encoder_ext.forward(
+            inputs,
+            embeddings,
+            offsets,
+            outputs,
+            B,
+            D,
+            C,
+            L,
+            S,
+            H,
+            calc_grad_inputs,
+            dy_dx,
+            gridtype,
+            align_corners,
+        )
+        # permute back to [B, L * C]
+        outputs = outputs.permute(1, 0, 2).reshape(B, L * C)
+        ctx.save_for_backward(inputs, embeddings, offsets, dy_dx)
+        ctx.dims = [B, D, C, L, S, H, gridtype]
+        ctx.calc_grad_inputs = calc_grad_inputs
+        ctx.align_corners = align_corners
+        return outputs
+    @staticmethod
+    def backward(ctx, grad):
+        """Return the gradient for ``embeddings`` and, if requested, ``inputs``.
+
+        The six remaining forward arguments receive ``None``.
+        """
+        inputs, embeddings, offsets, dy_dx = ctx.saved_tensors
+        B, D, C, L, S, H, gridtype = ctx.dims
+        calc_grad_inputs = ctx.calc_grad_inputs
+        align_corners = ctx.align_corners
+        # grad: [B, L * C] --> [L, B, C]
+        grad = grad.view(B, L, C).permute(1, 0, 2).contiguous()
+        grad_embeddings = torch.zeros_like(embeddings)
+        if calc_grad_inputs:
+            grad_inputs = torch.zeros_like(inputs, dtype=embeddings.dtype)
+        else:
+            # One-element placeholder, mirroring the dy_dx placeholder in forward().
+            grad_inputs = torch.zeros(1, device=inputs.device, dtype=embeddings.dtype)
+        grid_encoder_ext.backward(
+            grad,
+            inputs,
+            embeddings,
+            offsets,
+            grad_embeddings,
+            B,
+            D,
+            C,
+            L,
+            S,
+            H,
+            calc_grad_inputs,
+            dy_dx,
+            grad_inputs,
+            gridtype,
+            align_corners,
+        )
+        if calc_grad_inputs:
+            # grad_inputs was allocated in the embeddings' dtype; cast it back.
+            grad_inputs = grad_inputs.to(inputs.dtype)
+            return grad_inputs, grad_embeddings, None, None, None, None, None, None
+        else:
+            return None, grad_embeddings, None, None, None, None, None, None
+class GridEncoder(torch.nn.Module):
+    """Multi-level grid encoding with a learnable embedding table.
+
+    Maps positions of shape ``[..., in_channels]`` to features of shape
+    ``[..., n_levels * lvl_channels]`` through ``GridEncoderFunction``.
+    """
+    def __init__(
+        self,
+        in_channels,
+        n_levels,
+        lvl_channels,
+        desired_resolution,
+        per_level_scale=2,
+        base_resolution=16,
+        log2_hashmap_size=19,
+        gridtype="hash",
+        align_corners=False,
+    ):
+        super(GridEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.n_levels = n_levels  # num levels, each level multiply resolution by 2
+        self.lvl_channels = lvl_channels  # encode channels per level
+        # Scale derived from desired_resolution; this is the value forward()
+        # passes on. The expression divides by zero when n_levels == 1.
+        self.per_level_scale = 2 ** (
+            math.log2(desired_resolution / base_resolution) / (n_levels - 1)
+        )
+        self.log2_hashmap_size = log2_hashmap_size
+        self.base_resolution = base_resolution
+        self.output_dim = n_levels * lvl_channels
+        self.gridtype = gridtype
+        # Any gridtype other than "hash" maps to id 1.
+        self.gridtype_id = 0 if gridtype == "hash" else 1
+        self.align_corners = align_corners
+        # allocate parameters
+        offsets = []
+        offset = 0
+        self.max_params = 2**log2_hashmap_size
+        # NOTE(review): the loop sizes each level with the `per_level_scale`
+        # argument (default 2), not with self.per_level_scale computed above.
+        # Switching it would change the shape of `embeddings`, which presumably
+        # breaks loading existing checkpoints -- confirm before changing.
+        for i in range(n_levels):
+            resolution = int(math.ceil(base_resolution * per_level_scale**i))
+            params_in_level = min(
+                self.max_params,
+                (resolution if align_corners else resolution + 1) ** in_channels,
+            )  # limit max number
+            params_in_level = int(math.ceil(params_in_level / 8) * 8)  # make divisible
+            offsets.append(offset)
+            offset += params_in_level
+        offsets.append(offset)
+        offsets = torch.from_numpy(np.array(offsets, dtype=np.int32))
+        self.register_buffer("offsets", offsets)
+        # `offsets` is a tensor at this point, so n_params is a 0-dim tensor.
+        self.n_params = offsets[-1] * lvl_channels
+        self.embeddings = torch.nn.Parameter(torch.empty(offset, lvl_channels))
+        self._init_weights()
+    def _init_weights(self):
+        """Fill the embedding table with uniform values in [-1e-4, 1e-4]."""
+        self.embeddings.data.uniform_(-1e-4, 1e-4)
+    def forward(self, inputs, bound=1):
+        """Encode positions given in ``[-bound, bound]``; leading dims are kept."""
+        # inputs: [..., in_channels], normalized real world positions in [-bound, bound]
+        # return: [..., n_levels * lvl_channels]
+        inputs = (inputs + bound) / (2 * bound)  # map to [0, 1]
+        prefix_shape = list(inputs.shape[:-1])
+        inputs = inputs.view(-1, self.in_channels)
+        # Input gradients are only computed when the inputs require them.
+        outputs = GridEncoderFunction.apply(
+            inputs,
+            self.embeddings,
+            self.offsets,
+            self.per_level_scale,
+            self.base_resolution,
+            inputs.requires_grad,
+            self.gridtype_id,
+            self.align_corners,
+        )
+        return outputs.view(prefix_shape + [self.output_dim])

citydreamer/extensions/grid_encoder/bindings.cpp ADDED Viewed

	@@ -0,0 +1,40 @@

+/**
+ * @File:   grid_encoder_ext_cuda.cpp
+ * @Author: Jiaxiang Tang (@ashawkey)
+ * @Date:   2023-04-15 10:39:17
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-04-15 11:01:32
+ * @Email:  [email protected]
+ * @Ref: https://github.com/ashawkey/torch-ngp
+ */
+#include <stdint.h>
+#include <torch/extension.h>
+#include <torch/torch.h>
+// inputs: [B, D], float, in [0, 1]
+// embeddings: [sO, C], float
+// offsets: [L + 1], int32 (built from an np.int32 array on the Python side)
+// outputs: [L, B, C], float (the Python wrapper permutes it to [B, L * C])
+// S: log2 of the per-level scale
+// H: base resolution
+void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings,
+                         const at::Tensor offsets, at::Tensor outputs,
+                         const uint32_t B, const uint32_t D, const uint32_t C,
+                         const uint32_t L, const float S, const uint32_t H,
+                         const bool calc_grad_inputs, at::Tensor dy_dx,
+                         const uint32_t gridtype, const bool align_corners);
+// grad: [L, B, C] as passed by the Python wrapper; grad_embeddings and
+// grad_inputs are pre-allocated by the caller.
+void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs,
+                          const at::Tensor embeddings, const at::Tensor offsets,
+                          at::Tensor grad_embeddings, const uint32_t B,
+                          const uint32_t D, const uint32_t C, const uint32_t L,
+                          const float S, const uint32_t H,
+                          const bool calc_grad_inputs, const at::Tensor dy_dx,
+                          at::Tensor grad_inputs, const uint32_t gridtype,
+                          const bool align_corners);
+// Expose both entry points to Python as `forward` and `backward`.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &grid_encode_forward,
+        "grid_encode_forward (CUDA)");
+  m.def("backward", &grid_encode_backward,
+        "grid_encode_backward (CUDA)");
+}

citydreamer/extensions/grid_encoder/grid_encoder_ext.cu ADDED Viewed

	@@ -0,0 +1,605 @@

+/**
+ * @File:   grid_encoder_ext.cu
+ * @Author: Jiaxiang Tang (@ashawkey)
+ * @Date:   2023-04-15 10:43:16
+ * @Last Modified by: Haozhe Xie
+ * @Last Modified at: 2023-04-29 11:47:54
+ * @Email:  [email protected]
+ * @Ref: https://github.com/ashawkey/torch-ngp
+ */
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/torch.h>
+#include <algorithm>
+#include <stdexcept>
+#include <cstdio>
+#include <stdint.h>
+// Tensor argument checks; each one raises through TORCH_CHECK with the
+// argument's name in the message.
+#define CHECK_CUDA(x)                                                          \
+  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor")
+#define CHECK_IS_INT(x)                                                        \
+  TORCH_CHECK(x.scalar_type() == at::ScalarType::Int,                          \
+              #x " must be an int tensor")
+// Accepts float, half and double.
+#define CHECK_IS_FLOATING(x)                                                   \
+  TORCH_CHECK(x.scalar_type() == at::ScalarType::Float ||                      \
+                  x.scalar_type() == at::ScalarType::Half ||                   \
+                  x.scalar_type() == at::ScalarType::Double,                   \
+              #x " must be a floating tensor")
+// just for compatibility of half precision in
+// AT_DISPATCH_FLOATING_TYPES_AND_HALF...
+// NOTE(review): the body is commented out, so this overload performs no add
+// and has no return statement despite returning at::Half; it must never be
+// called.
+static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+  // requires CUDA >= 10 and ARCH >= 70
+  // this is very slow compared to float or __half2, and never used.
+  // return atomicAdd(reinterpret_cast<__half*>(address), val);
+}
+// Division rounded up: (val + divisor - 1) / divisor.
+template <typename T>
+static inline __host__ __device__ T div_round_up(T val, T divisor) {
+  return (val + divisor - 1) / divisor;
+}
+// Hash a D-dimensional grid coordinate: XOR of each coordinate multiplied by
+// a fixed per-dimension constant (32-bit arithmetic).
+template <uint32_t D>
+__device__ uint32_t fast_hash(const uint32_t pos_grid[D]) {
+  static_assert(D <= 7, "fast_hash can only hash up to 7 dimensions.");
+  // While 1 is technically not a good prime for hashing (or a prime at all), it
+  // helps memory coherence and is sufficient for our use case of obtaining a
+  // uniformly colliding index from high-dimensional coordinates.
+  constexpr uint32_t primes[7] = {1,          2654435761, 805459861, 3674653429,
+                                  2097192037, 1434869437, 2165219737};
+  uint32_t result = 0;
+#pragma unroll
+  for (uint32_t i = 0; i < D; ++i) {
+    result ^= pos_grid[i] * primes[i];
+  }
+  return result;
+}
+// Map a grid coordinate to an element offset inside one level's table.
+// Returns (index % hashmap_size) * C + ch, where index is the dense linear
+// index, or the hash of the coordinate when gridtype == 0 and the dense grid
+// does not fit into hashmap_size entries.
+template <uint32_t D, uint32_t C>
+__device__ uint32_t get_grid_index(const uint32_t gridtype,
+                                   const bool align_corners, const uint32_t ch,
+                                   const uint32_t hashmap_size,
+                                   const uint32_t resolution,
+                                   const uint32_t pos_grid[D]) {
+  uint32_t stride = 1;
+  uint32_t index = 0;
+  // Accumulate the dense index; stops early once stride exceeds hashmap_size.
+#pragma unroll
+  for (uint32_t d = 0; d < D && stride <= hashmap_size; d++) {
+    index += pos_grid[d] * stride;
+    stride *= align_corners ? resolution : (resolution + 1);
+  }
+  // NOTE: for NeRF, the hash is in fact not necessary. Check
+  // https://github.com/NVlabs/instant-ngp/issues/97. gridtype: 0 == hash, 1 ==
+  // tiled
+  if (gridtype == 0 && stride > hashmap_size) {
+    index = fast_hash<D>(pos_grid);
+  }
+  return (index % hashmap_size) * C + ch;
+}
+/**
+ * Forward pass of the multi-resolution grid encoder.
+ *
+ * Each thread handles one sample `b` (x-dimension of the launch) at one
+ * `level` (blockIdx.y) and writes C interpolated features to
+ * outputs[level, b, :]. A sample with any coordinate outside [0, 1] gets
+ * all-zero outputs (and an all-zero dy_dx slice when that is requested).
+ *
+ * Template params: scalar_t = element type of grid / outputs / dy_dx,
+ *                  D = number of input coordinates per sample,
+ *                  C = number of features stored per grid entry.
+ * inputs:  B * D floats, indexed [b, d].
+ * grid:    embedding table; level l starts at entry offsets[l] and holds
+ *          offsets[l + 1] - offsets[l] entries of C features each.
+ * outputs: indexed [level, b, ch].
+ * S, H:    the per-level scale is exp2f(level * S) * H - 1.
+ * dy_dx:   only touched when calc_grad_inputs is set; indexed
+ *          [b, level, d, ch] and filled with d(output[ch]) / d(input[d]).
+ * gridtype, align_corners: forwarded to get_grid_index; align_corners also
+ *          selects the 0 / 0.5 offset added to the scaled position.
+ */ template <typename scalar_t, uint32_t D, uint32_t C>
+__global__ void
+kernel_grid(const float *__restrict__ inputs, const scalar_t *__restrict__ grid,
+            const int *__restrict__ offsets, scalar_t *__restrict__ outputs,
+            const uint32_t B, const uint32_t L, const float S, const uint32_t H,
+            const bool calc_grad_inputs, scalar_t *__restrict__ dy_dx,
+            const uint32_t gridtype, const bool align_corners) {
+  const uint32_t b = blockIdx.x * blockDim.x + threadIdx.x; // sample index
+  if (b >= B) // surplus threads of the last block have nothing to do
+    return;
+  const uint32_t level = blockIdx.y; // resolution level of this block row
+  // locate: move every base pointer to the slice owned by this (sample, level)
+  grid += (uint32_t)offsets[level] * C; // first entry of this level's table
+  inputs += b * D;
+  outputs += level * B * C + b * C; // outputs are indexed [level, b, ch]
+  // check input range (should be in [0, 1])
+  bool flag_oob = false;
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    if (inputs[d] < 0 || inputs[d] > 1) {
+      flag_oob = true;
+    }
+  }
+  // if input out of bound, just set output to 0
+  if (flag_oob) {
+#pragma unroll
+    for (uint32_t ch = 0; ch < C; ch++) {
+      outputs[ch] = 0;
+    }
+    if (calc_grad_inputs) {
+      dy_dx += b * D * L * C + level * D * C; // B L D C
+#pragma unroll
+      for (uint32_t d = 0; d < D; d++) {
+#pragma unroll
+        for (uint32_t ch = 0; ch < C; ch++) {
+          dy_dx[d * C + ch] = 0;
+        }
+      }
+    }
+    return;
+  }
+  const uint32_t hashmap_size = offsets[level + 1] - offsets[level]; // entries in this level
+  const float scale = exp2f(level * S) * H - 1.0f; // maps [0, 1] onto this level's cell units
+  const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+  // calculate coordinate: pos_grid receives the integer cell index and pos
+  // is left holding the fractional offset inside that cell
+  float pos[D];
+  uint32_t pos_grid[D];
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+    pos_grid[d] = floorf(pos[d]);
+    pos[d] -= (float)pos_grid[d];
+  }
+  // printf("[b=%d, l=%d] pos=(%f, %f)+(%d, %d)\n", b, level, pos[0], pos[1],
+  // pos_grid[0], pos_grid[1]);
+  // interpolate: weighted sum over the 2^D corners of the enclosing cell; bit
+  // d of idx picks the lower (0) or upper (1) corner along dimension d
+  scalar_t results[C] = {0}; // temp results in register
+#pragma unroll
+  for (uint32_t idx = 0; idx < (1 << D); idx++) {
+    float w = 1; // product of the per-dimension linear weights
+    uint32_t pos_grid_local[D];
+#pragma unroll
+    for (uint32_t d = 0; d < D; d++) {
+      if ((idx & (1 << d)) == 0) {
+        w *= 1 - pos[d];
+        pos_grid_local[d] = pos_grid[d];
+      } else {
+        w *= pos[d];
+        pos_grid_local[d] = pos_grid[d] + 1;
+      }
+    }
+    uint32_t index = get_grid_index<D, C>(
+        gridtype, align_corners, 0, hashmap_size, resolution, pos_grid_local); // ch = 0: index of the entry's first feature
+// writing to register (fast)
+#pragma unroll
+    for (uint32_t ch = 0; ch < C; ch++) {
+      results[ch] += w * grid[index + ch];
+    }
+    // printf("[b=%d, l=%d] int %d, idx %d, w %f, val %f\n", b, level, idx,
+    // index, w, grid[index]);
+  }
+// writing to global memory (slow)
+#pragma unroll
+  for (uint32_t ch = 0; ch < C; ch++) {
+    outputs[ch] = results[ch];
+  }
+  // prepare dy_dx for calc_grad_inputs
+  // differentiable (soft) indexing:
+  // https://discuss.pytorch.org/t/differentiable-indexing/17647/9
+  if (calc_grad_inputs) {
+    dy_dx += b * D * L * C + level * D * C; // B L D C
+#pragma unroll
+    for (uint32_t gd = 0; gd < D; gd++) { // gd: dimension being differentiated
+      scalar_t results_grad[C] = {0};
+#pragma unroll
+      for (uint32_t idx = 0; idx < (1 << (D - 1)); idx++) { // corners over the other D - 1 dimensions
+        float w = scale; // chain-rule factor from pos = input * scale + offset
+        uint32_t pos_grid_local[D];
+#pragma unroll
+        for (uint32_t nd = 0; nd < D - 1; nd++) {
+          const uint32_t d = (nd >= gd) ? (nd + 1) : nd; // skip over gd
+          if ((idx & (1 << nd)) == 0) {
+            w *= 1 - pos[d];
+            pos_grid_local[d] = pos_grid[d];
+          } else {
+            w *= pos[d];
+            pos_grid_local[d] = pos_grid[d] + 1;
+          }
+        }
+        pos_grid_local[gd] = pos_grid[gd]; // lower neighbour along gd
+        uint32_t index_left =
+            get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size,
+                                 resolution, pos_grid_local);
+        pos_grid_local[gd] = pos_grid[gd] + 1; // upper neighbour along gd
+        uint32_t index_right =
+            get_grid_index<D, C>(gridtype, align_corners, 0, hashmap_size,
+                                 resolution, pos_grid_local);
+#pragma unroll
+        for (uint32_t ch = 0; ch < C; ch++) {
+          results_grad[ch] +=
+              w * (grid[index_right + ch] - grid[index_left + ch]);
+        }
+      }
+#pragma unroll
+      for (uint32_t ch = 0; ch < C; ch++) {
+        dy_dx[gd * C + ch] = results_grad[ch];
+      }
+    }
+  }
+}
+/**
+ * Backward pass with respect to the embedding table: scatters the incoming
+ * gradient into grad_grid using the same corner weights as the forward
+ * interpolation.
+ *
+ * Each thread handles N_C consecutive channels (starting at `ch`) of one
+ * sample `b` at one `level` (blockIdx.y). Contributions are accumulated with
+ * atomicAdd, since several threads may target the same table entry.
+ *
+ * grad:      indexed [level, b, ch].
+ * inputs:    B * D floats, indexed [b, d].
+ * grid:      not read by this kernel.
+ * offsets:   level l owns table entries offsets[l] .. offsets[l + 1] - 1.
+ * grad_grid: same layout as the embedding table; per the early-return note
+ *            below it is expected to be zero-initialised by the caller.
+ * S, H, gridtype, align_corners: used exactly as in the forward kernel.
+ */ template <typename scalar_t, uint32_t D, uint32_t C, uint32_t N_C>
+__global__ void kernel_grid_backward(
+    const scalar_t *__restrict__ grad, const float *__restrict__ inputs,
+    const scalar_t *__restrict__ grid, const int *__restrict__ offsets,
+    scalar_t *__restrict__ grad_grid, const uint32_t B, const uint32_t L,
+    const float S, const uint32_t H, const uint32_t gridtype,
+    const bool align_corners) {
+  const uint32_t b = (blockIdx.x * blockDim.x + threadIdx.x) * N_C / C; // sample index
+  if (b >= B)
+    return;
+  const uint32_t level = blockIdx.y;
+  const uint32_t ch = (blockIdx.x * blockDim.x + threadIdx.x) * N_C - b * C; // first channel owned by this thread
+  // locate: move every base pointer to the slice owned by this thread
+  grad_grid += offsets[level] * C;
+  inputs += b * D;
+  grad += level * B * C + b * C + ch; // L, B, C
+  const uint32_t hashmap_size = offsets[level + 1] - offsets[level];
+  const float scale = exp2f(level * S) * H - 1.0f;
+  const uint32_t resolution = (uint32_t)ceil(scale) + 1;
+// check input range (should be in [0, 1])
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    if (inputs[d] < 0 || inputs[d] > 1) {
+      return; // grad is init as 0, so we simply return.
+    }
+  }
+  // calculate coordinate: integer cell index in pos_grid, fractional offset
+  // inside the cell left in pos
+  float pos[D];
+  uint32_t pos_grid[D];
+#pragma unroll
+  for (uint32_t d = 0; d < D; d++) {
+    pos[d] = inputs[d] * scale + (align_corners ? 0.0f : 0.5f);
+    pos_grid[d] = floorf(pos[d]);
+    pos[d] -= (float)pos_grid[d];
+  }
+  scalar_t grad_cur[N_C] = {0}; // fetch to register
+#pragma unroll
+  for (uint32_t c = 0; c < N_C; c++) {
+    grad_cur[c] = grad[c];
+  }
+// interpolate: visit the 2^D corners of the cell, bit d of idx selecting the
+// lower (0) or upper (1) corner along dimension d
+#pragma unroll
+  for (uint32_t idx = 0; idx < (1 << D); idx++) {
+    float w = 1; // interpolation weight of this corner
+    uint32_t pos_grid_local[D];
+#pragma unroll
+    for (uint32_t d = 0; d < D; d++) {
+      if ((idx & (1 << d)) == 0) {
+        w *= 1 - pos[d];
+        pos_grid_local[d] = pos_grid[d];
+      } else {
+        w *= pos[d];
+        pos_grid_local[d] = pos_grid[d] + 1;
+      }
+    }
+    uint32_t index = get_grid_index<D, C>(
+        gridtype, align_corners, ch, hashmap_size, resolution, pos_grid_local); // already offset to channel ch
+    // atomicAdd for __half is slow (especially for large values), so we use
+    // __half2 if N_C % 2 == 0
+    // TODO: use float which is better than __half, if N_C % 2 != 0
+    if (std::is_same<scalar_t, at::Half>::value && N_C % 2 == 0) {
+#pragma unroll
+      for (uint32_t c = 0; c < N_C; c += 2) {
+        // process two __half at once (by interpreting as a __half2)
+        __half2 v = {(__half)(w * grad_cur[c]), (__half)(w * grad_cur[c + 1])};
+        atomicAdd((__half2 *)&grad_grid[index + c], v);
+      }
+      // float, or __half when N_C % 2 != 0 (which means C == 1)
+    } else {
+#pragma unroll
+      for (uint32_t c = 0; c < N_C; c++) {
+        atomicAdd(&grad_grid[index + c], w * grad_cur[c]);
+      }
+    }
+  }
+}
+// Backward pass with respect to the input coordinates.
+//
+// One thread per (sample, input dimension) pair. The thread contracts the
+// incoming gradient grad[level, sample, ch] with the stored Jacobian
+// dy_dx[sample, level, dim, ch] over every level and channel, and writes the
+// scalar result to grad_inputs[sample * D + dim].
+template <typename scalar_t, uint32_t D, uint32_t C>
+__global__ void kernel_input_backward(const scalar_t *__restrict__ grad,
+                                      const scalar_t *__restrict__ dy_dx,
+                                      scalar_t *__restrict__ grad_inputs,
+                                      uint32_t B, uint32_t L) {
+  const uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid >= B * D) {
+    return;
+  }
+  const uint32_t sample = tid / D;
+  const uint32_t dim = tid % D;
+  // Jacobian slice that belongs to this sample.
+  dy_dx += sample * L * D * C;
+  scalar_t acc = 0;
+#pragma unroll
+  for (uint32_t level = 0; level < L; ++level) {
+    const uint32_t grad_base = level * B * C + sample * C;
+    const uint32_t jac_base = level * D * C + dim * C;
+#pragma unroll
+    for (uint32_t ch = 0; ch < C; ++ch) {
+      acc += grad[grad_base + ch] * dy_dx[jac_base + ch];
+    }
+  }
+  grad_inputs[tid] = acc;
+}
+// Launches kernel_grid for a compile-time D and a runtime feature count C.
+//
+// The kernel needs C as a template argument, so the runtime value is matched
+// against the supported counts (1, 2, 4, 8); anything else raises
+// std::runtime_error. The launch uses N_THREAD threads per block, enough
+// blocks along x to cover the B samples, and one block row per level.
+template <typename scalar_t, uint32_t D>
+void kernel_grid_wrapper(const float *inputs, const scalar_t *embeddings,
+                         const int *offsets, scalar_t *outputs,
+                         const uint32_t B, const uint32_t C, const uint32_t L,
+                         const float S, const uint32_t H,
+                         const bool calc_grad_inputs, scalar_t *dy_dx,
+                         const uint32_t gridtype, const bool align_corners) {
+  static constexpr uint32_t N_THREAD = 512;
+  const dim3 launch_grid = {div_round_up(B, N_THREAD), L, 1};
+  if (C == 1) {
+    kernel_grid<scalar_t, D, 1><<<launch_grid, N_THREAD>>>(
+        inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+        dy_dx, gridtype, align_corners);
+  } else if (C == 2) {
+    kernel_grid<scalar_t, D, 2><<<launch_grid, N_THREAD>>>(
+        inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+        dy_dx, gridtype, align_corners);
+  } else if (C == 4) {
+    kernel_grid<scalar_t, D, 4><<<launch_grid, N_THREAD>>>(
+        inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+        dy_dx, gridtype, align_corners);
+  } else if (C == 8) {
+    kernel_grid<scalar_t, D, 8><<<launch_grid, N_THREAD>>>(
+        inputs, embeddings, offsets, outputs, B, L, S, H, calc_grad_inputs,
+        dy_dx, gridtype, align_corners);
+  } else {
+    throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+  }
+}
+// Typed forward entry point: selects the kernel_grid_wrapper instantiation
+// that matches the runtime input dimensionality D.
+//
+// inputs:     [B, D], float, in [0, 1]
+// embeddings: [sO, C], scalar_t
+// offsets:    [L + 1], int
+// outputs:    [L, B, C], scalar_t (L first, so only one level of hashmap
+//             needs to fit into cache at a time.)
+// H:          base resolution
+// dy_dx:      [B, L * D * C], only written when calc_grad_inputs is set
+//
+// Throws std::runtime_error when D is not 2, 3, 4 or 5 (and, through
+// kernel_grid_wrapper, when C is not 1, 2, 4 or 8).
+template <typename scalar_t>
+void grid_encode_forward_cuda(const float *inputs, const scalar_t *embeddings,
+                              const int *offsets, scalar_t *outputs,
+                              const uint32_t B, const uint32_t D,
+                              const uint32_t C, const uint32_t L, const float S,
+                              const uint32_t H, const bool calc_grad_inputs,
+                              scalar_t *dy_dx, const uint32_t gridtype,
+                              const bool align_corners) {
+  switch (D) {
+  case 2:
+    kernel_grid_wrapper<scalar_t, 2>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 3:
+    kernel_grid_wrapper<scalar_t, 3>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 4:
+    kernel_grid_wrapper<scalar_t, 4>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  case 5:
+    kernel_grid_wrapper<scalar_t, 5>(inputs, embeddings, offsets, outputs, B, C,
+                                     L, S, H, calc_grad_inputs, dy_dx, gridtype,
+                                     align_corners);
+    break;
+  default:
+    // This switch is on the input dimension, so report D (the previous
+    // message named C and its allowed values, which pointed callers at the
+    // wrong argument).
+    throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
+  }
+}
+// Launches the backward kernels for a compile-time D and a runtime feature
+// count C.
+//
+// kernel_grid_backward always runs and accumulates into grad_embeddings; each
+// of its threads covers N_C = min(2, C) channels, so B * C / N_C threads are
+// spread over blocks of N_THREAD along x, with one block row per level.
+// kernel_input_backward runs afterwards only when calc_grad_inputs is set,
+// using one thread per (sample, input dimension) pair. C outside
+// {1, 2, 4, 8} raises std::runtime_error.
+template <typename scalar_t, uint32_t D>
+void kernel_grid_backward_wrapper(
+    const scalar_t *grad, const float *inputs, const scalar_t *embeddings,
+    const int *offsets, scalar_t *grad_embeddings, const uint32_t B,
+    const uint32_t C, const uint32_t L, const float S, const uint32_t H,
+    const bool calc_grad_inputs, scalar_t *dy_dx, scalar_t *grad_inputs,
+    const uint32_t gridtype, const bool align_corners) {
+  static constexpr uint32_t N_THREAD = 256;
+  const uint32_t N_C = std::min(2u, C); // n_features_per_thread
+  const dim3 table_grid = {div_round_up(B * C / N_C, N_THREAD), L, 1};
+  if (C == 1) {
+    kernel_grid_backward<scalar_t, D, 1, 1><<<table_grid, N_THREAD>>>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+        gridtype, align_corners);
+    if (calc_grad_inputs) {
+      kernel_input_backward<scalar_t, D, 1>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    }
+  } else if (C == 2) {
+    kernel_grid_backward<scalar_t, D, 2, 2><<<table_grid, N_THREAD>>>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+        gridtype, align_corners);
+    if (calc_grad_inputs) {
+      kernel_input_backward<scalar_t, D, 2>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    }
+  } else if (C == 4) {
+    kernel_grid_backward<scalar_t, D, 4, 2><<<table_grid, N_THREAD>>>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+        gridtype, align_corners);
+    if (calc_grad_inputs) {
+      kernel_input_backward<scalar_t, D, 4>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    }
+  } else if (C == 8) {
+    kernel_grid_backward<scalar_t, D, 8, 2><<<table_grid, N_THREAD>>>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, L, S, H,
+        gridtype, align_corners);
+    if (calc_grad_inputs) {
+      kernel_input_backward<scalar_t, D, 8>
+          <<<div_round_up(B * D, N_THREAD), N_THREAD>>>(grad, dy_dx,
+                                                        grad_inputs, B, L);
+    }
+  } else {
+    throw std::runtime_error{"GridEncoding: C must be 1, 2, 4, or 8."};
+  }
+}
+// Typed backward entry point: selects the kernel_grid_backward_wrapper
+// instantiation that matches the runtime input dimensionality D.
+//
+// grad:            [L, B, C], scalar_t
+// inputs:          [B, D], float, in [0, 1]
+// embeddings:      [sO, C], scalar_t
+// offsets:         [L + 1], int
+// grad_embeddings: [sO, C], scalar_t
+// H:               base resolution
+// dy_dx, grad_inputs: only used when calc_grad_inputs is set
+//
+// Throws std::runtime_error when D is not 2, 3, 4 or 5 (and, through
+// kernel_grid_backward_wrapper, when C is not 1, 2, 4 or 8).
+template <typename scalar_t>
+void grid_encode_backward_cuda(
+    const scalar_t *grad, const float *inputs, const scalar_t *embeddings,
+    const int *offsets, scalar_t *grad_embeddings, const uint32_t B,
+    const uint32_t D, const uint32_t C, const uint32_t L, const float S,
+    const uint32_t H, const bool calc_grad_inputs, scalar_t *dy_dx,
+    scalar_t *grad_inputs, const uint32_t gridtype, const bool align_corners) {
+  switch (D) {
+  case 2:
+    kernel_grid_backward_wrapper<scalar_t, 2>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 3:
+    kernel_grid_backward_wrapper<scalar_t, 3>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 4:
+    kernel_grid_backward_wrapper<scalar_t, 4>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  case 5:
+    kernel_grid_backward_wrapper<scalar_t, 5>(
+        grad, inputs, embeddings, offsets, grad_embeddings, B, C, L, S, H,
+        calc_grad_inputs, dy_dx, grad_inputs, gridtype, align_corners);
+    break;
+  default:
+    // This switch is on the input dimension, so report D (the previous
+    // message named C and its allowed values, which pointed callers at the
+    // wrong argument).
+    throw std::runtime_error{"GridEncoding: D must be 2, 3, 4, or 5."};
+  }
+}
+// Tensor-level forward entry point.
+//
+// Validates the tensors (CUDA residency, contiguity, dtype family) and then
+// hands raw pointers to grid_encode_forward_cuda, instantiated for the
+// element type of `embeddings`. `inputs` is always read as float and
+// `offsets` as int; `outputs` and `dy_dx` are accessed with the same element
+// type as `embeddings`.
+void grid_encode_forward(const at::Tensor inputs, const at::Tensor embeddings,
+                         const at::Tensor offsets, at::Tensor outputs,
+                         const uint32_t B, const uint32_t D, const uint32_t C,
+                         const uint32_t L, const float S, const uint32_t H,
+                         const bool calc_grad_inputs, at::Tensor dy_dx,
+                         const uint32_t gridtype, const bool align_corners) {
+  // Device placement.
+  CHECK_CUDA(inputs);
+  CHECK_CUDA(embeddings);
+  CHECK_CUDA(offsets);
+  CHECK_CUDA(outputs);
+  CHECK_CUDA(dy_dx);
+  // Memory layout: the kernels index the buffers as dense arrays.
+  CHECK_CONTIGUOUS(inputs);
+  CHECK_CONTIGUOUS(embeddings);
+  CHECK_CONTIGUOUS(offsets);
+  CHECK_CONTIGUOUS(outputs);
+  CHECK_CONTIGUOUS(dy_dx);
+  // Element types.
+  CHECK_IS_FLOATING(inputs);
+  CHECK_IS_FLOATING(embeddings);
+  CHECK_IS_INT(offsets);
+  CHECK_IS_FLOATING(outputs);
+  CHECK_IS_FLOATING(dy_dx);
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      embeddings.scalar_type(), "grid_encode_forward", ([&] {
+        const float *inputs_ptr = inputs.data_ptr<float>();
+        const scalar_t *embeddings_ptr = embeddings.data_ptr<scalar_t>();
+        const int *offsets_ptr = offsets.data_ptr<int>();
+        scalar_t *outputs_ptr = outputs.data_ptr<scalar_t>();
+        scalar_t *dy_dx_ptr = dy_dx.data_ptr<scalar_t>();
+        grid_encode_forward_cuda<scalar_t>(
+            inputs_ptr, embeddings_ptr, offsets_ptr, outputs_ptr, B, D, C, L,
+            S, H, calc_grad_inputs, dy_dx_ptr, gridtype, align_corners);
+      }));
+}
+// Tensor-level backward entry point.
+//
+// Validates the tensors (CUDA residency, contiguity, dtype family) and then
+// hands raw pointers to grid_encode_backward_cuda, instantiated for the
+// element type of `grad`. `inputs` is always read as float and `offsets` as
+// int; every other tensor is accessed with the same element type as `grad`.
+void grid_encode_backward(const at::Tensor grad, const at::Tensor inputs,
+                          const at::Tensor embeddings, const at::Tensor offsets,
+                          at::Tensor grad_embeddings, const uint32_t B,
+                          const uint32_t D, const uint32_t C, const uint32_t L,
+                          const float S, const uint32_t H,
+                          const bool calc_grad_inputs, const at::Tensor dy_dx,
+                          at::Tensor grad_inputs, const uint32_t gridtype,
+                          const bool align_corners) {
+  // Device placement.
+  CHECK_CUDA(grad);
+  CHECK_CUDA(inputs);
+  CHECK_CUDA(embeddings);
+  CHECK_CUDA(offsets);
+  CHECK_CUDA(grad_embeddings);
+  CHECK_CUDA(dy_dx);
+  CHECK_CUDA(grad_inputs);
+  // Memory layout: the kernels index the buffers as dense arrays.
+  CHECK_CONTIGUOUS(grad);
+  CHECK_CONTIGUOUS(inputs);
+  CHECK_CONTIGUOUS(embeddings);
+  CHECK_CONTIGUOUS(offsets);
+  CHECK_CONTIGUOUS(grad_embeddings);
+  CHECK_CONTIGUOUS(dy_dx);
+  CHECK_CONTIGUOUS(grad_inputs);
+  // Element types.
+  CHECK_IS_FLOATING(grad);
+  CHECK_IS_FLOATING(inputs);
+  CHECK_IS_FLOATING(embeddings);
+  CHECK_IS_INT(offsets);
+  CHECK_IS_FLOATING(grad_embeddings);
+  CHECK_IS_FLOATING(dy_dx);
+  CHECK_IS_FLOATING(grad_inputs);
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      grad.scalar_type(), "grid_encode_backward", ([&] {
+        const scalar_t *grad_ptr = grad.data_ptr<scalar_t>();
+        const float *inputs_ptr = inputs.data_ptr<float>();
+        const scalar_t *embeddings_ptr = embeddings.data_ptr<scalar_t>();
+        const int *offsets_ptr = offsets.data_ptr<int>();
+        scalar_t *grad_embeddings_ptr = grad_embeddings.data_ptr<scalar_t>();
+        scalar_t *dy_dx_ptr = dy_dx.data_ptr<scalar_t>();
+        scalar_t *grad_inputs_ptr = grad_inputs.data_ptr<scalar_t>();
+        grid_encode_backward_cuda<scalar_t>(
+            grad_ptr, inputs_ptr, embeddings_ptr, offsets_ptr,
+            grad_embeddings_ptr, B, D, C, L, S, H, calc_grad_inputs, dy_dx_ptr,
+            grad_inputs_ptr, gridtype, align_corners);
+      }));
+}

citydreamer/extensions/grid_encoder/setup.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   setup.py
+# @Author: Jiaxiang Tang (@ashawkey)
+# @Date:   2023-04-15 10:33:32
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2023-04-29 10:47:10
+# @Email:  [email protected]
+# @Ref: https://github.com/ashawkey/torch-ngp
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+setup(
+    name="grid_encoder",
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension(
+            name="grid_encoder_ext",
+            sources=[
+                "grid_encoder_ext.cu",
+                "bindings.cpp",
+            ],
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++14"],
+                "nvcc": [
+                    "-O3",
+                    "-std=c++14",
+                    "-U__CUDA_NO_HALF_OPERATORS__",
+                    "-U__CUDA_NO_HALF_CONVERSIONS__",
+                    "-U__CUDA_NO_HALF2_OPERATORS__",
+                ],
+            },
+        ),
+    ],
+    cmdclass={
+        "build_ext": BuildExtension,
+    },
+)

citydreamer/extensions/voxlib/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, check out LICENSE.md
+from voxlib import ray_voxel_intersection_perspective

citydreamer/extensions/voxlib/ray_voxel_intersection.cu ADDED Viewed

	@@ -0,0 +1,351 @@

+// Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, check out LICENSE.md
+//
+// The ray marching algorithm used in this file is a variety of modified
+// Bresenham method:
+// http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.42.3443&rep=rep1&type=pdf
+// Search for "voxel traversal algorithm" for related information
+#include <torch/types.h>
+#include <ATen/ATen.h>
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAApplyUtils.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <curand_kernel.h>
+#include <time.h>
+//#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+#include "voxlib_common.h"
+struct RVIP_Params { // launch parameters handed by value to the ray/voxel kernel
+  int voxel_dims[3];    // in_voxel.size(0..2)
+  int voxel_strides[3]; // in_voxel.stride(0..2), in elements
+  int max_samples;      // number of hit slots recorded per pixel
+  int img_dims[2];      // [0] bounds the pixel row index, [1] the column index
+  // Camera parameters
+  float cam_ori[3];  // ray origin, in voxel-grid coordinates
+  float cam_fwd[3];  // normalised viewing direction
+  float cam_side[3]; // normalised cross(cam_fwd, caller-supplied up vector)
+  float cam_up[3];   // normalised cross(cam_side, cam_fwd)
+  float cam_c[2];    // image centre: [0] along rows, [1] along columns
+  float cam_f;       // weight of cam_fwd in each ray direction (presumably the focal length in pixels)
+  // unsigned long seed;
+};
+/*
+    out_voxel_id: torch CUDA int32  [   img_dims[0], img_dims[1], max_samples, 1]
+    out_depth:    torch CUDA float  [2, img_dims[0], img_dims[1], max_samples, 1]
+    out_raydirs:  torch CUDA float  [   img_dims[0], img_dims[1],           1, 3]
+    Image coordinates refer to the center of the pixel.
+    Voxel coordinate [0, 0, 0] is at the corner of the corner block (instead of
+    at the center).
+*/
+/**
+ * Per-pixel voxel traversal for a perspective camera.
+ *
+ * One thread per pixel: the x index of the launch maps to the image column
+ * (img_coords[1]) and the y index to the image row (img_coords[0]), with
+ * TILE_DIM as the block edge length assumed by the index computation. The
+ * thread builds the normalised ray through its pixel, stores it in
+ * out_raydirs, then walks the voxel grid one cell boundary at a time. For
+ * each of the p.max_samples slots k it records the next voxel whose id is
+ * non-zero:
+ *   out_voxel_id[pix, k] - id read from in_voxel (0 when none was found)
+ *   out_depth[0, pix, k] - ray parameter t at which that voxel is entered
+ *   out_depth[1, pix, k] - t of the following boundary crossing
+ * Slots that remain once the ray has walked off the grid keep t = t2 = NaN
+ * and id 0.
+ */ template <int TILE_DIM>
+static __global__ void ray_voxel_intersection_perspective_kernel(
+    int32_t *__restrict__ out_voxel_id, float *__restrict__ out_depth,
+    float *__restrict__ out_raydirs, const int32_t *__restrict__ in_voxel,
+    const RVIP_Params p) {
+  int img_coords[2]; // [0] = row, [1] = column
+  img_coords[1] = blockIdx.x * TILE_DIM + threadIdx.x;
+  img_coords[0] = blockIdx.y * TILE_DIM + threadIdx.y;
+  if (img_coords[0] >= p.img_dims[0] || img_coords[1] >= p.img_dims[1]) {
+    return;
+  }
+  int pix_index = img_coords[0] * p.img_dims[1] + img_coords[1]; // row-major pixel index
+  // Calculate ray origin and direction
+  float rayori[3], raydir[3];
+  rayori[0] = p.cam_ori[0];
+  rayori[1] = p.cam_ori[1];
+  rayori[2] = p.cam_ori[2];
+  // Camera intrinsics: pixel offsets from the image centre p.cam_c
+  float ndc_imcoords[2];
+  ndc_imcoords[0] = p.cam_c[0] - (float)img_coords[0]; // Flip height
+  ndc_imcoords[1] = (float)img_coords[1] - p.cam_c[1];
+  raydir[0] = p.cam_up[0] * ndc_imcoords[0] + p.cam_side[0] * ndc_imcoords[1] +
+              p.cam_fwd[0] * p.cam_f;
+  raydir[1] = p.cam_up[1] * ndc_imcoords[0] + p.cam_side[1] * ndc_imcoords[1] +
+              p.cam_fwd[1] * p.cam_f;
+  raydir[2] = p.cam_up[2] * ndc_imcoords[0] + p.cam_side[2] * ndc_imcoords[1] +
+              p.cam_fwd[2] * p.cam_f;
+  normalize<float, 3>(raydir);
+  // Save out_raydirs
+  out_raydirs[pix_index * 3] = raydir[0];
+  out_raydirs[pix_index * 3 + 1] = raydir[1];
+  out_raydirs[pix_index * 3 + 2] = raydir[2];
+  float axis_t[3];  // per axis: ray parameter of the next cell boundary
+  int axis_int[3];  // integer coordinates of the current cell
+  // int axis_intbound[3];
+  // Current voxel
+  axis_int[0] = floorf(rayori[0]);
+  axis_int[1] = floorf(rayori[1]);
+  axis_int[2] = floorf(rayori[2]);
+#pragma unroll
+  for (int i = 0; i < 3; i++) {
+    if (raydir[i] > 0) {
+      // Initial t value
+      // Handle boundary case where rayori[i] is a whole number. Always round Up
+      // for the next block
+      // axis_t[i] = (ceilf(nextafterf(rayori[i], HUGE_VALF)) - rayori[i]) /
+      // raydir[i];
+      axis_t[i] = ((float)(axis_int[i] + 1) - rayori[i]) / raydir[i];
+    } else if (raydir[i] < 0) {
+      axis_t[i] = ((float)axis_int[i] - rayori[i]) / raydir[i];
+    } else {
+      axis_t[i] = HUGE_VALF; // ray never crosses a boundary along this axis
+    }
+  }
+  // Fused raymarching and sampling
+  bool quit = false; // set once the ray steps past the grid in its direction of travel
+  for (int cur_plane = 0; cur_plane < p.max_samples;
+       cur_plane++) { // Last cycle is for calculating p2
+    float t = nanf("0");  // entry parameter of the hit; stays NaN when nothing is hit
+    float t2 = nanf("0"); // parameter of the boundary crossing after the hit
+    int32_t blk_id = 0;
+    // Find the next intersection
+    while (!quit) {
+      // Find the next smallest t
+      float tnow;
+      /*
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+          if (axis_t[i] <= axis_t[(i+1)%3] && axis_t[i] <= axis_t[(i+2)%3]) {
+              // Update current t
+              tnow = axis_t[i];
+              // Update t candidates
+              if (raydir[i] > 0) {
+                  axis_int[i] += 1;
+                  if (axis_int[i] >= p.voxel_dims[i]) {
+                      quit = true;
+                  }
+                  axis_t[i] = ((float)(axis_int[i]+1) - rayori[i]) / raydir[i];
+              } else {
+                  axis_int[i] -= 1;
+                  if (axis_int[i] < 0) {
+                      quit = true;
+                  }
+                  axis_t[i] = ((float)axis_int[i] - rayori[i]) / raydir[i];
+              }
+              break; // Avoid advancing multiple steps as axis_t is updated
+          }
+      }
+      */
+      // Hand unroll: step along whichever axis reaches its boundary first
+      if (axis_t[0] <= axis_t[1] && axis_t[0] <= axis_t[2]) {
+        // Update current t
+        tnow = axis_t[0];
+        // Update t candidates
+        if (raydir[0] > 0) {
+          axis_int[0] += 1;
+          if (axis_int[0] >= p.voxel_dims[0]) {
+            quit = true;
+          }
+          axis_t[0] = ((float)(axis_int[0] + 1) - rayori[0]) / raydir[0];
+        } else {
+          axis_int[0] -= 1;
+          if (axis_int[0] < 0) {
+            quit = true;
+          }
+          axis_t[0] = ((float)axis_int[0] - rayori[0]) / raydir[0];
+        }
+      } else if (axis_t[1] <= axis_t[2]) {
+        tnow = axis_t[1];
+        if (raydir[1] > 0) {
+          axis_int[1] += 1;
+          if (axis_int[1] >= p.voxel_dims[1]) {
+            quit = true;
+          }
+          axis_t[1] = ((float)(axis_int[1] + 1) - rayori[1]) / raydir[1];
+        } else {
+          axis_int[1] -= 1;
+          if (axis_int[1] < 0) {
+            quit = true;
+          }
+          axis_t[1] = ((float)axis_int[1] - rayori[1]) / raydir[1];
+        }
+      } else {
+        tnow = axis_t[2];
+        if (raydir[2] > 0) {
+          axis_int[2] += 1;
+          if (axis_int[2] >= p.voxel_dims[2]) {
+            quit = true;
+          }
+          axis_t[2] = ((float)(axis_int[2] + 1) - rayori[2]) / raydir[2];
+        } else {
+          axis_int[2] -= 1;
+          if (axis_int[2] < 0) {
+            quit = true;
+          }
+          axis_t[2] = ((float)axis_int[2] - rayori[2]) / raydir[2];
+        }
+      }
+      if (quit) {
+        break;
+      }
+      // Skip empty space
+      // Could there be deadlock if the ray direction is away from the world?
+      if (axis_int[0] < 0 || axis_int[0] >= p.voxel_dims[0] ||
+          axis_int[1] < 0 || axis_int[1] >= p.voxel_dims[1] ||
+          axis_int[2] < 0 || axis_int[2] >= p.voxel_dims[2]) {
+        continue;
+      }
+      // Test intersection using voxel grid
+      blk_id = in_voxel[axis_int[0] * p.voxel_strides[0] +
+                        axis_int[1] * p.voxel_strides[1] +
+                        axis_int[2] * p.voxel_strides[2]];
+      if (blk_id == 0) { // id 0 is treated as empty space
+        continue;
+      }
+      // Now that there is an intersection
+      t = tnow;
+      // Calculate t2
+      /*
+      #pragma unroll
+      for (int i=0; i<3; i++) {
+          if (axis_t[i] <= axis_t[(i+1)%3] && axis_t[i] <= axis_t[(i+2)%3]) {
+              t2 = axis_t[i];
+              break;
+          }
+      }
+      */
+      // Hand unroll: t2 is the smallest pending boundary parameter
+      if (axis_t[0] <= axis_t[1] && axis_t[0] <= axis_t[2]) {
+        t2 = axis_t[0];
+      } else if (axis_t[1] <= axis_t[2]) {
+        t2 = axis_t[1];
+      } else {
+        t2 = axis_t[2];
+      }
+      break;
+    } // while !quit (ray marching loop)
+    out_depth[pix_index * p.max_samples + cur_plane] = t; // first half of out_depth: entry
+    out_depth[p.img_dims[0] * p.img_dims[1] * p.max_samples +
+              pix_index * p.max_samples + cur_plane] = t2; // second half of out_depth: exit
+    out_voxel_id[pix_index * p.max_samples + cur_plane] = blk_id;
+  } // cur_plane
+}
+/*
+    out:
+        out_voxel_id: torch CUDA int32  [   img_dims[0], img_dims[1], max_samples, 1]
+        out_depth:    torch CUDA float  [2, img_dims[0], img_dims[1], max_samples, 1]
+        out_raydirs:  torch CUDA float  [   img_dims[0], img_dims[1],           1, 3]
+    in:
+        in_voxel:     torch CUDA int32  [X, Y, Z] [40, 512, 512]
+        cam_ori:      torch      float  [3]
+        cam_dir:      torch      float  [3]
+        cam_up:       torch      float  [3]
+        cam_f:                   float
+        cam_c:                   float  [2]
+        img_dims:                int    [2]
+        max_samples:             int
+*/
+// Host entry point: casts one ray per pixel of a perspective camera through
+// an int32 voxel grid and returns {out_voxel_id, out_depth, out_raydirs}.
+//
+// in_voxel:    CUDA int32 tensor with 3 dimensions; it is read through its
+//              strides, so it does not have to be contiguous.
+// cam_ori:     float32, 3 elements - ray origin in voxel coordinates.
+// cam_dir:     float32, 3 elements - viewing direction (normalised here).
+// cam_up:      float32, 3 elements - up hint used to build the camera frame.
+// cam_f:       scale applied to the forward axis when forming ray directions.
+// cam_c:       2 values - image centre along rows and columns.
+// img_dims:    2 values - number of rows and columns.
+// max_samples: number of hits recorded per pixel.
+//
+// Raises c10::Error (via TORCH_CHECK) when an argument has the wrong device,
+// dtype or size.
+std::vector<torch::Tensor> ray_voxel_intersection_perspective_cuda(
+    const torch::Tensor &in_voxel, const torch::Tensor &cam_ori,
+    const torch::Tensor &cam_dir, const torch::Tensor &cam_up, float cam_f,
+    const std::vector<float> &cam_c, const std::vector<int> &img_dims,
+    int max_samples) {
+  // Validate with TORCH_CHECK rather than assert(): assert() expands to
+  // nothing when NDEBUG is defined, which would let malformed arguments reach
+  // the raw-pointer reads below.
+  CHECK_CUDA(in_voxel);
+  // assert(in_voxel.dtype() == torch::kU8);
+  TORCH_CHECK(in_voxel.scalar_type() == torch::kInt32,
+              "in_voxel must be an int32 tensor"); // Minecraft compatibility
+  TORCH_CHECK(in_voxel.dim() == 3, "in_voxel must have 3 dimensions");
+  TORCH_CHECK(cam_ori.scalar_type() == torch::kFloat32,
+              "cam_ori must be a float32 tensor");
+  TORCH_CHECK(cam_ori.numel() == 3, "cam_ori must have 3 elements");
+  TORCH_CHECK(cam_dir.scalar_type() == torch::kFloat32,
+              "cam_dir must be a float32 tensor");
+  TORCH_CHECK(cam_dir.numel() == 3, "cam_dir must have 3 elements");
+  TORCH_CHECK(cam_up.scalar_type() == torch::kFloat32,
+              "cam_up must be a float32 tensor");
+  TORCH_CHECK(cam_up.numel() == 3, "cam_up must have 3 elements");
+  // cam_c[0..1] and img_dims[0..1] are indexed unconditionally below.
+  TORCH_CHECK(cam_c.size() == 2, "cam_c must have 2 elements");
+  TORCH_CHECK(img_dims.size() == 2, "img_dims must have 2 elements");
+  // Stream of the current device (the default argument selects it).
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  torch::Device device = in_voxel.device();
+  RVIP_Params p;
+  // Calculate camera rays
+  // The vectors are read through raw pointers on the host, so bring them to
+  // the CPU and make sure the three values are adjacent in memory.
+  const torch::Tensor cam_ori_c = cam_ori.cpu().contiguous();
+  const torch::Tensor cam_dir_c = cam_dir.cpu().contiguous();
+  const torch::Tensor cam_up_c = cam_up.cpu().contiguous();
+  // Get the coordinate frame of camera space in world space
+  normalize<float, 3>(p.cam_fwd, cam_dir_c.data_ptr<float>());
+  cross<float>(p.cam_side, p.cam_fwd, cam_up_c.data_ptr<float>());
+  normalize<float, 3>(p.cam_side);
+  cross<float>(p.cam_up, p.cam_side, p.cam_fwd);
+  normalize<float, 3>(p.cam_up); // Not absolutely necessary as both vectors are
+                                 // normalized. But just in case...
+  copyarr<float, 3>(p.cam_ori, cam_ori_c.data_ptr<float>());
+  p.cam_f = cam_f;
+  p.cam_c[0] = cam_c[0];
+  p.cam_c[1] = cam_c[1];
+  p.max_samples = max_samples;
+  // printf("[Renderer] max_dist: %ld\n", max_dist);
+  p.voxel_dims[0] = in_voxel.size(0);
+  p.voxel_dims[1] = in_voxel.size(1);
+  p.voxel_dims[2] = in_voxel.size(2);
+  p.voxel_strides[0] = in_voxel.stride(0);
+  p.voxel_strides[1] = in_voxel.stride(1);
+  p.voxel_strides[2] = in_voxel.stride(2);
+  // printf("[Renderer] Voxel resolution: %ld, %ld, %ld\n", p.voxel_dims[0],
+  // p.voxel_dims[1], p.voxel_dims[2]);
+  p.img_dims[0] = img_dims[0];
+  p.img_dims[1] = img_dims[1];
+  // Create output tensors
+  // For Minecraft Seg Mask
+  torch::Tensor out_voxel_id =
+      torch::empty({p.img_dims[0], p.img_dims[1], p.max_samples, 1},
+                   torch::TensorOptions().dtype(torch::kInt32).device(device));
+  torch::Tensor out_depth;
+  // Produce two sets of localcoords, one for entry point, the other one for
+  // exit point. They share the same corner_ids.
+  out_depth = torch::empty(
+      {2, p.img_dims[0], p.img_dims[1], p.max_samples, 1},
+      torch::TensorOptions().dtype(torch::kFloat32).device(device));
+  torch::Tensor out_raydirs = torch::empty({p.img_dims[0], p.img_dims[1], 1, 3},
+                                           torch::TensorOptions()
+                                               .dtype(torch::kFloat32)
+                                               .device(device)
+                                               .requires_grad(false));
+  // One thread per pixel in TILE_DIM x TILE_DIM blocks; x spans the columns
+  // and y the rows, both rounded up to whole tiles.
+  const int TILE_DIM = 8;
+  dim3 dimGrid((p.img_dims[1] + TILE_DIM - 1) / TILE_DIM,
+               (p.img_dims[0] + TILE_DIM - 1) / TILE_DIM, 1);
+  dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
+  ray_voxel_intersection_perspective_kernel<TILE_DIM>
+      <<<dimGrid, dimBlock, 0, stream>>>(
+          out_voxel_id.data_ptr<int32_t>(), out_depth.data_ptr<float>(),
+          out_raydirs.data_ptr<float>(), in_voxel.data_ptr<int32_t>(), p);
+  return {out_voxel_id, out_depth, out_raydirs};
+}

citydreamer/extensions/voxlib/setup.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# This work is made available under the Nvidia Source Code License-NC.
+# To view a copy of this license, check out LICENSE.md
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+cxx_args = ["-fopenmp"]
+nvcc_args = []
+setup(
+    name="voxrender",
+    version="1.0.0",
+    ext_modules=[
+        CUDAExtension(
+            "voxlib",
+            [
+                "voxlib.cpp",
+                "ray_voxel_intersection.cu",
+            ],
+            extra_compile_args={"cxx": cxx_args, "nvcc": nvcc_args},
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)

citydreamer/extensions/voxlib/voxlib.cpp ADDED Viewed

	@@ -0,0 +1,21 @@

+// Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, check out LICENSE.md
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <torch/extension.h>
+#include <vector>
+/* Fast voxel traversal along rays.
+ * Declaration only: the function is defined in ray_voxel_intersection.cu and
+ * returns {out_voxel_id, out_depth, out_raydirs}.
+ */
+std::vector<torch::Tensor> ray_voxel_intersection_perspective_cuda(
+    const torch::Tensor &in_voxel, const torch::Tensor &cam_ori,
+    const torch::Tensor &cam_dir, const torch::Tensor &cam_up, float cam_f,
+    const std::vector<float> &cam_c, const std::vector<int> &img_dims,
+    int max_samples);
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("ray_voxel_intersection_perspective",
+        &ray_voxel_intersection_perspective_cuda,
+        "Ray-voxel intersections given perspective camera parameters (CUDA)");
+}

citydreamer/extensions/voxlib/voxlib_common.h ADDED Viewed

	@@ -0,0 +1,83 @@

+// Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+//
+// This work is made available under the Nvidia Source Code License-NC.
+// To view a copy of this license, check out LICENSE.md
+#ifndef VOXLIB_COMMON_H
+#define VOXLIB_COMMON_H
+// Tensor validation helpers built on TORCH_CHECK. Each macro stringifies its
+// argument (#x) so the failure message names the offending expression.
+// CHECK_CUDA: fails unless x.is_cuda() is true.
+#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
+// CHECK_CONTIGUOUS: fails unless x.is_contiguous() is true.
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+// CHECK_INPUT: runs both checks above.
+// NOTE: this expands to TWO statements (not wrapped in do { } while (0)), so
+// it must not be used as the unbraced body of an if / else / loop.
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+// CHECK_CPU: fails unless x.device().is_cpu() is true.
+#define CHECK_CPU(x)                                                           \
+  TORCH_CHECK(x.device().is_cpu(), #x " must be a CPU tensor")
+#include <cuda.h>
+#include <cuda_runtime.h>
+// CUDA vector math functions
+// Integer division rounded toward negative infinity (built-in `/` truncates
+// toward zero instead). Correct for every sign combination of a and b.
+// As with plain `/`, b == 0 and INT_MIN / -1 are undefined.
+__host__ __device__ __forceinline__ int floor_div(int a, int b) {
+  int c = a / b;
+  // Truncation and floor differ only when the division is inexact AND the
+  // operands have opposite signs. The former `c * b > a` test detected that
+  // case only for b > 0 (e.g. it returned -3 for 7 / -2 and 2 for -7 / -2).
+  if ((a % b != 0) && ((a < 0) != (b < 0))) {
+    c--;
+  }
+  return c;
+}
+// 3D cross product: r = a x b, reading/writing exactly elements [0..2].
+// Host-only (declared __host__ without __device__).
+// NOTE: r is written component by component while a and b are still being
+// read, so r must not alias a or b.
+template <typename scalar_t>
+__host__ __forceinline__ void cross(scalar_t *r, const scalar_t *a,
+                                    const scalar_t *b) {
+  r[0] = a[1] * b[2] - a[2] * b[1];
+  r[1] = a[2] * b[0] - a[0] * b[2];
+  r[2] = a[0] * b[1] - a[1] * b[0];
+}
+// Dot product of two 3-element float vectors (reads a[0..2] and b[0..2]).
+// Callable from both host and device code.
+__device__ __host__ __forceinline__ float dot(const float *a, const float *b) {
+  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+}
+// Copies the first `ndim` elements of a into r (element-wise assignment).
+// `ndim` is a compile-time template argument, which is what allows the loop to
+// be fully unrolled as requested by the pragma.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void copyarr(scalar_t *r,
+                                                 const scalar_t *a) {
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    r[i] = a[i];
+  }
+}
+// TODO: use rsqrt to speed up
+// inplace version
+// Scales the `ndim`-element vector a to unit Euclidean length, overwriting a.
+// The length is computed with sqrtf, i.e. in single precision regardless of
+// scalar_t. There is no guard for a zero-length vector: every component is
+// then divided by zero.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void normalize(scalar_t *a) {
+  scalar_t vec_len = 0.0f;
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    vec_len += a[i] * a[i];
+  }
+  vec_len = sqrtf(vec_len);
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    a[i] /= vec_len;
+  }
+}
+// normalize + copy
+// Writes the unit-length version of the `ndim`-element vector a into r and
+// leaves a untouched. Same caveats as the in-place overload: the length is
+// computed with sqrtf (single precision) and a zero-length input results in a
+// division by zero for every component.
+template <typename scalar_t, int ndim>
+__device__ __host__ __forceinline__ void normalize(scalar_t *r,
+                                                   const scalar_t *a) {
+  scalar_t vec_len = 0.0f;
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    vec_len += a[i] * a[i];
+  }
+  vec_len = sqrtf(vec_len);
+#pragma unroll
+  for (int i = 0; i < ndim; i++) {
+    r[i] = a[i] / vec_len;
+  }
+}
+#endif // VOXLIB_COMMON_H

citydreamer/inference.py ADDED Viewed

	@@ -0,0 +1,537 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   inference.py
+# @Author: Haozhe Xie
+# @Date:   2024-03-02 16:30:00
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-03-03 12:10:18
+# @Email:  [email protected]
+import copy
+import cv2
+import logging
+import math
+import numpy as np
+import torch
+import torchvision
+import citydreamer.extensions.extrude_tensor
+import citydreamer.extensions.voxlib
+# Global constants
+# Per-class heights. Only "ROOF" is read in this module, where it is the number
+# of voxel layers stacked on top of each building as its roof; the remaining
+# entries are presumably consumed by layout-generation code elsewhere (confirm).
+HEIGHTS = {
+    "ROAD": 4,
+    "GREEN_LANDS": 8,
+    "CONSTRUCTION": 10,
+    "COAST_ZONES": 0,
+    "ROOF": 1,
+}
+# Semantic class IDs used in the segmentation map and the voxel volume.
+# "BLD_ROOF" (7) lies outside the LAYOUT_N_CLASSES (7) layout classes 0..6.
+CLASSES = {
+    "NULL": 0,
+    "ROAD": 1,
+    "BLD_FACADE": 2,
+    "GREEN_LANDS": 3,
+    "CONSTRUCTION": 4,
+    "COAST_ZONES": 5,
+    "OTHERS": 6,
+    "BLD_ROOF": 7,
+}
+# NOTE: ID > 10 are reserved for building instances.
+# Assume the ID of a facade instance is 2k, the corresponding roof instance is 2k - 1.
+CONSTANTS = {
+    # Offset added to connected-component labels before doubling them into
+    # (even) building instance IDs.
+    "BLD_INS_LABEL_MIN": 10,
+    # Number of one-hot channels produced for the layout segmentation map.
+    "LAYOUT_N_CLASSES": 7,
+    # Side length of the layout volume: EXTENDED_VOL_SIZE minus a
+    # BUILDING_VOL_SIZE border on each side (2880 - 2 * 672 = 1536).
+    "LAYOUT_VOL_SIZE": 1536,
+    # Side length of the per-building crop of the height-field/seg tensor.
+    "BUILDING_VOL_SIZE": 672,
+    # Side length of the randomly chosen patch cut from the full layout.
+    "EXTENDED_VOL_SIZE": 2880,
+    # Height of the seg volume; also the divisor that normalizes the height field.
+    "LAYOUT_MAX_HEIGHT": 640,
+    # Angle (degrees) used to derive the camera focal length.
+    "GES_VFOV": 20,
+    # Size of the rendered image in pixels.
+    "GES_IMAGE_HEIGHT": 540,
+    "GES_IMAGE_WIDTH": 960,
+    # Extra pixels rendered around each image patch and cropped away afterwards.
+    "IMAGE_PADDING": 8,
+    # Passed to the ray-voxel intersection extension as its `max_samples`.
+    "N_VOXEL_INTERSECT_SAMPLES": 6,
+}
+def generate_city(fgm, bgm, hf, seg, radius, altitude, azimuth):
+    cam_pos = get_orbit_camera_position(radius, altitude, azimuth)
+    seg, building_stats = get_instance_seg_map(seg)
+    # Generate latent codes
+    logging.info("Generating latent codes ...")
+    bg_z, building_zs = get_latent_codes(
+        building_stats,
+        bgm.module.cfg.NETWORK.GANCRAFT.STYLE_DIM,
+        bgm.output_device,
+    )
+    # Random choose the center of the patch
+    cy = (
+        np.random.randint(seg.shape[0] - CONSTANTS["EXTENDED_VOL_SIZE"])
+        + CONSTANTS["EXTENDED_VOL_SIZE"] // 2
+    )
+    cx = (
+        np.random.randint(seg.shape[1] - CONSTANTS["EXTENDED_VOL_SIZE"])
+        + CONSTANTS["EXTENDED_VOL_SIZE"] // 2
+    )
+    # Generate local image patch of the height field and seg map
+    part_hf, part_seg = get_part_hf_seg(hf, seg, cx, cy, CONSTANTS["EXTENDED_VOL_SIZE"])
+    # Generate local image patch of the height field and seg map
+    part_hf, part_seg = get_part_hf_seg(hf, seg, cx, cy, CONSTANTS["EXTENDED_VOL_SIZE"])
+    # print(part_hf.shape)    # (2880, 2880)
+    # print(part_seg.shape)   # (2880, 2880)
+    # Recalculate the building positions based on the current patch
+    _building_stats = get_part_building_stats(part_seg, building_stats, cx, cy)
+    # Generate the concatenated height field and seg. map tensor
+    hf_seg = get_hf_seg_tensor(part_hf, part_seg, bgm.output_device)
+    # print(hf_seg.size())    # torch.Size([1, 8, 2880, 2880])
+    # Build seg_volume
+    logging.info("Generating seg volume ...")
+    seg_volume = get_seg_volume(part_hf, part_seg)
+    logging.info("Rendering City Image ...")
+    img = render(
+        (CONSTANTS["GES_IMAGE_HEIGHT"] // 5, CONSTANTS["GES_IMAGE_WIDTH"] // 5),
+        seg_volume,
+        hf_seg,
+        cam_pos,
+        bgm,
+        fgm,
+        _building_stats,
+        bg_z,
+        building_zs,
+    )
+    return ((img.cpu().numpy().squeeze().transpose((1, 2, 0)) / 2 + 0.5) * 255).astype(
+        np.uint8
+    )
+def get_orbit_camera_position(radius, altitude, azimuth):
+    cx = CONSTANTS["LAYOUT_VOL_SIZE"] // 2
+    cy = cx
+    theta = np.deg2rad(azimuth)
+    cam_x = cx + radius * math.cos(theta)
+    cam_y = cy + radius * math.sin(theta)
+    return {"x": cam_x, "y": cam_y, "z": altitude}
+def get_instance_seg_map(seg_map):
+    # Mapping constructions to buildings
+    seg_map[seg_map == CLASSES["CONSTRUCTION"]] = CLASSES["BLD_FACADE"]
+    # Use connected components to get building instances
+    _, labels, stats, _ = cv2.connectedComponentsWithStats(
+        (seg_map == CLASSES["BLD_FACADE"]).astype(np.uint8), connectivity=4
+    )
+    # Remove non-building instance masks
+    labels[seg_map != CLASSES["BLD_FACADE"]] = 0
+    # Building instance mask
+    building_mask = labels != 0
+    # Make building instance IDs are even numbers and start from 10
+    # Assume the ID of a facade instance is 2k, the corresponding roof instance is 2k - 1.
+    labels = (labels + CONSTANTS["BLD_INS_LABEL_MIN"]) * 2
+    seg_map[seg_map == CLASSES["BLD_FACADE"]] = 0
+    seg_map = seg_map * (1 - building_mask) + labels * building_mask
+    assert np.max(labels) < 2147483648
+    return seg_map.astype(np.int32), stats[:, :4]
+def get_latent_codes(building_stats, bg_style_dim, output_device):
+    bg_z = _get_z(output_device, bg_style_dim)
+    building_zs = {
+        (i + CONSTANTS["BLD_INS_LABEL_MIN"]) * 2: _get_z(output_device)
+        for i in range(len(building_stats))
+    }
+    return bg_z, building_zs
+def _get_z(device, z_dim=256):
+    if z_dim is None:
+        return None
+    return torch.randn(1, z_dim, dtype=torch.float32, device=device)
+def get_part_hf_seg(hf, seg, cx, cy, patch_size):
+    part_hf = _get_image_patch(hf, cx, cy, patch_size)
+    part_seg = _get_image_patch(seg, cx, cy, patch_size)
+    assert part_hf.shape == (
+        patch_size,
+        patch_size,
+    ), part_hf.shape
+    assert part_hf.shape == part_seg.shape, part_seg.shape
+    return part_hf, part_seg
+def _get_image_patch(image, cx, cy, patch_size):
+    sx = cx - patch_size // 2
+    sy = cy - patch_size // 2
+    ex = sx + patch_size
+    ey = sy + patch_size
+    return image[sy:ey, sx:ex]
+def get_part_building_stats(part_seg, building_stats, cx, cy):
+    _buildings = np.unique(part_seg[part_seg > CONSTANTS["BLD_INS_LABEL_MIN"]])
+    _building_stats = {}
+    for b in _buildings:
+        _b = b // 2 - CONSTANTS["BLD_INS_LABEL_MIN"]
+        _building_stats[b] = [
+            building_stats[_b, 1] - cy + building_stats[_b, 3] / 2,
+            building_stats[_b, 0] - cx + building_stats[_b, 2] / 2,
+        ]
+    return _building_stats
+def get_hf_seg_tensor(part_hf, part_seg, output_device):
+    part_hf = torch.from_numpy(part_hf[None, None, ...]).to(output_device)
+    part_seg = torch.from_numpy(part_seg[None, None, ...]).to(output_device)
+    part_hf = part_hf / CONSTANTS["LAYOUT_MAX_HEIGHT"]
+    part_seg = _masks_to_onehots(part_seg[:, 0, :, :], CONSTANTS["LAYOUT_N_CLASSES"])
+    return torch.cat([part_hf, part_seg], dim=1)
+def _masks_to_onehots(masks, n_class, ignored_classes=[]):
+    b, h, w = masks.shape
+    n_class_actual = n_class - len(ignored_classes)
+    one_hot_masks = torch.zeros(
+        (b, n_class_actual, h, w), dtype=torch.float32, device=masks.device
+    )
+    n_class_cnt = 0
+    for i in range(n_class):
+        if i not in ignored_classes:
+            one_hot_masks[:, n_class_cnt] = masks == i
+            n_class_cnt += 1
+    return one_hot_masks
+def get_seg_volume(part_hf, part_seg):
+    """Extrude the 2D seg map along the height field into a 3D label volume.
+
+    Args:
+        part_hf: 2D NumPy height field, either EXTENDED_VOL_SIZE or
+            LAYOUT_VOL_SIZE on each side.
+        part_seg: 2D NumPy (instance) segmentation map of the same shape.
+
+    Returns:
+        The squeezed CUDA tensor produced by the ``TensorExtruder`` extension,
+        with roof IDs (facade ID - 1) scattered into the ``HEIGHTS["ROOF"]``
+        voxel layers directly above every building column.
+    """
+    tensor_extruder = citydreamer.extensions.extrude_tensor.TensorExtruder(
+        CONSTANTS["LAYOUT_MAX_HEIGHT"]
+    )
+    # An extended patch is cropped to the layout size by dropping a border of
+    # BUILDING_VOL_SIZE pixels on every side.
+    if part_hf.shape == (
+        CONSTANTS["EXTENDED_VOL_SIZE"],
+        CONSTANTS["EXTENDED_VOL_SIZE"],
+    ):
+        part_hf = part_hf[
+            CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+            CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+        ]
+        # part_hf is now a 2D (LAYOUT_VOL_SIZE, LAYOUT_VOL_SIZE) array.
+        part_seg = part_seg[
+            CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+            CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+        ]
+        # part_seg is now a 2D (LAYOUT_VOL_SIZE, LAYOUT_VOL_SIZE) array.
+    assert part_hf.shape == (
+        CONSTANTS["LAYOUT_VOL_SIZE"],
+        CONSTANTS["LAYOUT_VOL_SIZE"],
+    )
+    assert part_hf.shape == part_seg.shape, part_seg.shape
+    # Both inputs are passed to the extruder as (1, 1, H, W) CUDA tensors.
+    seg_volume = tensor_extruder(
+        torch.from_numpy(part_seg[None, None, ...]).cuda(),
+        torch.from_numpy(part_hf[None, None, ...]).cuda(),
+    ).squeeze()
+    logging.debug("The shape of SegVolume: %s" % (seg_volume.size(),))
+    # Change the top-level voxel of the "Building Facade" to "Building Roof"
+    roof_seg_map = part_seg.copy()
+    non_roof_msk = part_seg <= CONSTANTS["BLD_INS_LABEL_MIN"]
+    # Assume the ID of a facade instance is 2k, the corresponding roof instance is 2k - 1.
+    roof_seg_map = roof_seg_map - 1
+    roof_seg_map[non_roof_msk] = 0
+    # Write the roof map into layer (height + rh) of every column. Non-building
+    # columns receive 0 there. NOTE(review): height + rh is not checked against
+    # the volume height.
+    for rh in range(1, HEIGHTS["ROOF"] + 1):
+        seg_volume = seg_volume.scatter_(
+            dim=2,
+            index=torch.from_numpy(part_hf[..., None] + rh).long().cuda(),
+            src=torch.from_numpy(roof_seg_map[..., None]).cuda(),
+        )
+    # print(seg_volume.size())  # torch.Size([1536, 1536, 640])
+    return seg_volume
+def get_voxel_intersection_perspective(seg_volume, camera_location):
+    CAMERA_FOCAL = (
+        CONSTANTS["GES_IMAGE_HEIGHT"] / 2 / np.tan(np.deg2rad(CONSTANTS["GES_VFOV"]))
+    )
+    # print(seg_volume.size())  # torch.Size([1536, 1536, 640])
+    camera_target = {
+        "x": seg_volume.size(1) // 2 - 1,
+        "y": seg_volume.size(0) // 2 - 1,
+    }
+    cam_origin = torch.tensor(
+        [
+            camera_location["y"],
+            camera_location["x"],
+            camera_location["z"],
+        ],
+        dtype=torch.float32,
+        device=seg_volume.device,
+    )
+    (
+        voxel_id,
+        depth2,
+        raydirs,
+    ) = citydreamer.extensions.voxlib.ray_voxel_intersection_perspective(
+        seg_volume,
+        cam_origin,
+        torch.tensor(
+            [
+                camera_target["y"] - camera_location["y"],
+                camera_target["x"] - camera_location["x"],
+                -camera_location["z"],
+            ],
+            dtype=torch.float32,
+            device=seg_volume.device,
+        ),
+        torch.tensor([0, 0, 1], dtype=torch.float32),
+        CAMERA_FOCAL * 2.06,
+        [
+            (CONSTANTS["GES_IMAGE_HEIGHT"] - 1) / 2.0,
+            (CONSTANTS["GES_IMAGE_WIDTH"] - 1) / 2.0,
+        ],
+        [CONSTANTS["GES_IMAGE_HEIGHT"], CONSTANTS["GES_IMAGE_WIDTH"]],
+        CONSTANTS["N_VOXEL_INTERSECT_SAMPLES"],
+    )
+    return (
+        voxel_id.unsqueeze(dim=0),
+        depth2.permute(1, 2, 0, 3, 4).unsqueeze(dim=0),
+        raydirs.unsqueeze(dim=0),
+        cam_origin.unsqueeze(dim=0),
+    )
+def _get_pad_img_bbox(sx, ex, sy, ey):
+    psx = sx - CONSTANTS["IMAGE_PADDING"] if sx != 0 else 0
+    psy = sy - CONSTANTS["IMAGE_PADDING"] if sy != 0 else 0
+    pex = (
+        ex + CONSTANTS["IMAGE_PADDING"]
+        if ex != CONSTANTS["GES_IMAGE_WIDTH"]
+        else CONSTANTS["GES_IMAGE_WIDTH"]
+    )
+    pey = (
+        ey + CONSTANTS["IMAGE_PADDING"]
+        if ey != CONSTANTS["GES_IMAGE_HEIGHT"]
+        else CONSTANTS["GES_IMAGE_HEIGHT"]
+    )
+    return psx, pex, psy, pey
+def _get_img_without_pad(img, sx, ex, sy, ey, psx, pex, psy, pey):
+    if CONSTANTS["IMAGE_PADDING"] == 0:
+        return img
+    return img[
+        :,
+        :,
+        sy - psy : ey - pey if ey != pey else ey,
+        sx - psx : ex - pex if ex != pex else ex,
+    ]
+def render_bg(
+    patch_size, gancraft_bg, hf_seg, voxel_id, depth2, raydirs, cam_origin, z
+):
+    """Render the background image patch by patch.
+
+    Args:
+        patch_size: ``(height, width)`` of one rendered image patch.
+        gancraft_bg: Background generator; must expose ``output_device``.
+        hf_seg: Height-field/seg tensor of spatial size EXTENDED_VOL_SIZE.
+        voxel_id, depth2, raydirs, cam_origin: Ray-voxel intersection results
+            with a leading batch dimension.
+        z: Background style code.
+
+    Returns:
+        A (1, 3, GES_IMAGE_HEIGHT, GES_IMAGE_WIDTH) float tensor. Rows/columns
+        beyond the last full patch (when the image size is not a multiple of
+        ``patch_size``) stay zero.
+    """
+    assert hf_seg.size(2) == CONSTANTS["EXTENDED_VOL_SIZE"]
+    assert hf_seg.size(3) == CONSTANTS["EXTENDED_VOL_SIZE"]
+    # Crop the extended tensor to the layout size (drop the building border).
+    hf_seg = hf_seg[
+        :,
+        :,
+        CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+        CONSTANTS["BUILDING_VOL_SIZE"] : -CONSTANTS["BUILDING_VOL_SIZE"],
+    ]
+    assert hf_seg.size(2) == CONSTANTS["LAYOUT_VOL_SIZE"]
+    assert hf_seg.size(3) == CONSTANTS["LAYOUT_VOL_SIZE"]
+    blurrer = torchvision.transforms.GaussianBlur(kernel_size=3, sigma=(2, 2))
+    # Collapse every instance ID (facades and roofs alike) to the generic
+    # facade class so that only layout classes remain.
+    _voxel_id = copy.deepcopy(voxel_id)
+    _voxel_id[voxel_id >= CONSTANTS["BLD_INS_LABEL_MIN"]] = CLASSES["BLD_FACADE"]
+    assert (_voxel_id < CONSTANTS["LAYOUT_N_CLASSES"]).all()
+    bg_img = torch.zeros(
+        1,
+        3,
+        CONSTANTS["GES_IMAGE_HEIGHT"],
+        CONSTANTS["GES_IMAGE_WIDTH"],
+        dtype=torch.float32,
+        device=gancraft_bg.output_device,
+    )
+    # Render background patches by patch to avoid OOM
+    for i in range(CONSTANTS["GES_IMAGE_HEIGHT"] // patch_size[0]):
+        for j in range(CONSTANTS["GES_IMAGE_WIDTH"] // patch_size[1]):
+            sy, sx = i * patch_size[0], j * patch_size[1]
+            ey, ex = sy + patch_size[0], sx + patch_size[1]
+            # Render a slightly larger (padded) box; the padding is cut off
+            # again before the patch is stored.
+            psx, pex, psy, pey = _get_pad_img_bbox(sx, ex, sy, ey)
+            output_bg = gancraft_bg(
+                hf_seg=hf_seg,
+                voxel_id=_voxel_id[:, psy:pey, psx:pex],
+                depth2=depth2[:, psy:pey, psx:pex],
+                raydirs=raydirs[:, psy:pey, psx:pex],
+                cam_origin=cam_origin,
+                building_stats=None,
+                z=z,
+                deterministic=True,
+            )
+            # Make road blurry
+            # (pixels whose first intersected voxel is a road voxel)
+            road_mask = (
+                (_voxel_id[:, None, psy:pey, psx:pex, 0, 0] == CLASSES["ROAD"])
+                .repeat(1, 3, 1, 1)
+                .float()
+            )
+            output_bg = blurrer(output_bg) * road_mask + output_bg * (1 - road_mask)
+            bg_img[:, :, sy:ey, sx:ex] = _get_img_without_pad(
+                output_bg, sx, ex, sy, ey, psx, pex, psy, pey
+            )
+    return bg_img
+def render_fg(
+    patch_size,
+    gancraft_fg,
+    building_id,
+    hf_seg,
+    voxel_id,
+    depth2,
+    raydirs,
+    cam_origin,
+    building_stats,
+    building_z,
+):
+    _voxel_id = copy.deepcopy(voxel_id)
+    _curr_bld = torch.tensor([building_id, building_id - 1], device=voxel_id.device)
+    _voxel_id[~torch.isin(_voxel_id, _curr_bld)] = 0
+    _voxel_id[voxel_id == building_id] = CLASSES["BLD_FACADE"]
+    _voxel_id[voxel_id == building_id - 1] = CLASSES["BLD_ROOF"]
+    # assert (_voxel_id < CONSTANTS["LAYOUT_N_CLASSES"]).all()
+    _hf_seg = copy.deepcopy(hf_seg)
+    _hf_seg[hf_seg != building_id] = 0
+    _hf_seg[hf_seg == building_id] = CLASSES["BLD_FACADE"]
+    _raydirs = copy.deepcopy(raydirs)
+    _raydirs[_voxel_id[..., 0, 0] == 0] = 0
+    # Crop the "hf_seg" image using the center of the target building as the reference
+    cx = CONSTANTS["EXTENDED_VOL_SIZE"] // 2 - int(building_stats[1])
+    cy = CONSTANTS["EXTENDED_VOL_SIZE"] // 2 - int(building_stats[0])
+    sx = cx - CONSTANTS["BUILDING_VOL_SIZE"] // 2
+    ex = cx + CONSTANTS["BUILDING_VOL_SIZE"] // 2
+    sy = cy - CONSTANTS["BUILDING_VOL_SIZE"] // 2
+    ey = cy + CONSTANTS["BUILDING_VOL_SIZE"] // 2
+    _hf_seg = hf_seg[:, :, sy:ey, sx:ex]
+    fg_img = torch.zeros(
+        1,
+        3,
+        CONSTANTS["GES_IMAGE_HEIGHT"],
+        CONSTANTS["GES_IMAGE_WIDTH"],
+        dtype=torch.float32,
+        device=gancraft_fg.output_device,
+    )
+    fg_mask = torch.zeros(
+        1,
+        1,
+        CONSTANTS["GES_IMAGE_HEIGHT"],
+        CONSTANTS["GES_IMAGE_WIDTH"],
+        dtype=torch.float32,
+        device=gancraft_fg.output_device,
+    )
+    # Prevent some buildings are out of bound.
+    # THIS SHOULD NEVER HAPPEN AGAIN.
+    # if (
+    #     _hf_seg.size(2) != CONSTANTS["BUILDING_VOL_SIZE"]
+    #     or _hf_seg.size(3) != CONSTANTS["BUILDING_VOL_SIZE"]
+    # ):
+    #     return fg_img, fg_mask
+    # Render foreground patches by patch to avoid OOM
+    for i in range(CONSTANTS["GES_IMAGE_HEIGHT"] // patch_size[0]):
+        for j in range(CONSTANTS["GES_IMAGE_WIDTH"] // patch_size[1]):
+            sy, sx = i * patch_size[0], j * patch_size[1]
+            ey, ex = sy + patch_size[0], sx + patch_size[1]
+            psx, pex, psy, pey = _get_pad_img_bbox(sx, ex, sy, ey)
+            if torch.count_nonzero(_raydirs[:, sy:ey, sx:ex]) > 0:
+                output_fg = gancraft_fg(
+                    _hf_seg,
+                    _voxel_id[:, psy:pey, psx:pex],
+                    depth2[:, psy:pey, psx:pex],
+                    _raydirs[:, psy:pey, psx:pex],
+                    cam_origin,
+                    building_stats=torch.from_numpy(np.array(building_stats)).unsqueeze(
+                        dim=0
+                    ),
+                    z=building_z,
+                    deterministic=True,
+                )
+                facade_mask = (
+                    voxel_id[:, sy:ey, sx:ex, 0, 0] == building_id
+                ).unsqueeze(dim=1)
+                roof_mask = (
+                    voxel_id[:, sy:ey, sx:ex, 0, 0] == building_id - 1
+                ).unsqueeze(dim=1)
+                facade_img = facade_mask * _get_img_without_pad(
+                    output_fg, sx, ex, sy, ey, psx, pex, psy, pey
+                )
+                # Make roof blurry
+                # output_fg = F.interpolate(
+                #     F.interpolate(output_fg * 0.8, scale_factor=0.75),
+                #     scale_factor=4 / 3,
+                # ),
+                roof_img = roof_mask * _get_img_without_pad(
+                    output_fg,
+                    sx,
+                    ex,
+                    sy,
+                    ey,
+                    psx,
+                    pex,
+                    psy,
+                    pey,
+                )
+                fg_mask[:, :, sy:ey, sx:ex] = torch.logical_or(facade_mask, roof_mask)
+                fg_img[:, :, sy:ey, sx:ex] = (
+                    facade_img * facade_mask + roof_img * roof_mask
+                )
+    return fg_img, fg_mask
+def render(
+    patch_size,
+    seg_volume,
+    hf_seg,
+    cam_pos,
+    gancraft_bg,
+    gancraft_fg,
+    building_stats,
+    bg_z,
+    building_zs,
+):
+    voxel_id, depth2, raydirs, cam_origin = get_voxel_intersection_perspective(
+        seg_volume, cam_pos
+    )
+    buildings = torch.unique(voxel_id[voxel_id > CONSTANTS["BLD_INS_LABEL_MIN"]])
+    # Remove odd numbers from the list because they are reserved by roofs.
+    buildings = buildings[buildings % 2 == 0]
+    with torch.no_grad():
+        bg_img = render_bg(
+            patch_size, gancraft_bg, hf_seg, voxel_id, depth2, raydirs, cam_origin, bg_z
+        )
+        for b in buildings:
+            assert b % 2 == 0, "Building Instance ID MUST be an even number."
+            fg_img, fg_mask = render_fg(
+                patch_size,
+                gancraft_fg,
+                b.item(),
+                hf_seg,
+                voxel_id,
+                depth2,
+                raydirs,
+                cam_origin,
+                building_stats[b.item()],
+                building_zs[b.item()],
+            )
+            bg_img = bg_img * (1 - fg_mask) + fg_img * fg_mask
+    return bg_img

citydreamer/model.py ADDED Viewed

	@@ -0,0 +1,1264 @@

+# -*- coding: utf-8 -*-
+#
+# @File:   model.py
+# @Author: Haozhe Xie
+# @Date:   2023-04-12 19:53:21
+# @Last Modified by: Haozhe Xie
+# @Last Modified at: 2024-03-03 11:15:36
+# @Email:  [email protected]
+# @Ref: https://github.com/FrozenBurning/SceneDreamer
+import numpy as np
+import torch
+import torch.nn.functional as F
+import citydreamer.extensions.grid_encoder
+class GanCraftGenerator(torch.nn.Module):
+    def __init__(self, cfg):
+        super(GanCraftGenerator, self).__init__()
+        self.cfg = cfg
+        self.render_net = RenderMLP(cfg)
+        self.denoiser = RenderCNN(cfg)
+        if cfg.NETWORK.GANCRAFT.ENCODER == "GLOBAL":
+            self.encoder = GlobalEncoder(cfg)
+        elif cfg.NETWORK.GANCRAFT.ENCODER == "LOCAL":
+            self.encoder = LocalEncoder(cfg)
+        else:
+            self.encoder = None
+        if (
+            not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+            and not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+        ):
+            raise ValueError(
+                "Either POS_EMD_INCUDE_CORDS or POS_EMD_INCUDE_FEATURES should be True."
+            )
+        if cfg.NETWORK.GANCRAFT.POS_EMD == "HASH_GRID":
+            grid_encoder_in_dim = 3 if cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS else 0
+            if (
+                cfg.NETWORK.GANCRAFT.ENCODER in ["GLOBAL", "LOCAL"]
+                and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+            ):
+                grid_encoder_in_dim += cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM
+            self.pos_encoder = citydreamer.extensions.grid_encoder.GridEncoder(
+                in_channels=grid_encoder_in_dim,
+                n_levels=cfg.NETWORK.GANCRAFT.HASH_GRID_N_LEVELS,
+                lvl_channels=cfg.NETWORK.GANCRAFT.HASH_GRID_LEVEL_DIM,
+                desired_resolution=cfg.NETWORK.GANCRAFT.HASH_GRID_RESOLUTION,
+            )
+        elif cfg.NETWORK.GANCRAFT.POS_EMD == "SIN_COS":
+            self.pos_encoder = SinCosEncoder(cfg)
+    def forward(
+        self,
+        hf_seg,
+        voxel_id,
+        depth2,
+        raydirs,
+        cam_origin,
+        building_stats=None,
+        z=None,
+        deterministic=False,
+    ):
+        r"""GANcraft Generator forward.
+        Args:
+            hf_seg (N x (1 + M) x H' x W' tensor) : height field + seg map, where M is the number of classes.
+            voxel_id (N x H x W x max_samples x 1 tensor): IDs of intersected tensors along each ray.
+            depth2 (N x H x W x 2 x max_samples x 1 tensor): Depths of entrance and exit points for each ray-voxel
+            intersection.
+            raydirs (N x H x W x 1 x 3 tensor): The direction of each ray.
+            cam_origin (N x 3 tensor): Camera origins.
+            building_stats (N x 5 tensor): The dy, dx, h, w, ID of the target building. (Only used in building mode)
+            z (N x STYLE_DIM tensor): The style vector.
+            deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling.
+        Returns:
+            fake_images (N x 3 x H x W tensor): fake images
+        """
+        bs, device = hf_seg.size(0), hf_seg.device
+        if z is None and self.cfg.NETWORK.GANCRAFT.STYLE_DIM is not None:
+            z = torch.randn(
+                bs,
+                self.cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                dtype=torch.float32,
+                device=device,
+            )
+        features = None
+        if self.encoder is not None:
+            features = self.encoder(hf_seg)
+        net_out = self._forward_perpix(
+            features,
+            voxel_id,
+            depth2,
+            raydirs,
+            cam_origin,
+            z,
+            building_stats,
+            deterministic,
+        )
+        fake_images = self._forward_global(net_out, z)
+        return fake_images
+    def _forward_perpix(
+        self,
+        features,
+        voxel_id,
+        depth2,
+        raydirs,
+        cam_origin,
+        z,
+        building_stats=None,
+        deterministic=False,
+    ):
+        r"""Sample points along rays, forwarding the per-point MLP and aggregate pixel features
+        Args:
+            features (N x C1 tensor): Local features determined by the current pixel.
+            voxel_id (N x H x W x M x 1 tensor): Voxel ids from ray-voxel intersection test. M: num intersected voxels
+            depth2 (N x H x W x 2 x M x 1 tensor): Depths of entrance and exit points for each ray-voxel intersection.
+            raydirs (N x H x W x 1 x 3 tensor): The direction of each ray.
+            cam_origin (N x 3 tensor): Camera origins.
+            z (N x C3 tensor): Intermediate style vectors.
+            building_stats (N x 4 tensor): The dy, dx, h, w of the target building. (Only used in building mode)
+            deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling.
+        Returns:
+            net_out: Per-pixel values obtained by blending the per-sample colors with the volume-rendering
+            weights; rays that hit nothing end up at -1.
+        """
+        # Generate sky_mask; PE transform on ray direction.
+        with torch.no_grad():
+            # sky_only_mask: when True, ray hits nothing but sky
+            # (the first intersected voxel ID of the ray is 0)
+            sky_only_mask = voxel_id[:, :, :, [0], :] == 0
+        with torch.no_grad():
+            normalized_cord, new_dists, new_idx = self._get_sampled_coordinates(
+                self.cfg.NETWORK.GANCRAFT.N_SAMPLE_POINTS_PER_RAY,
+                depth2,
+                raydirs,
+                cam_origin,
+                building_stats,
+                deterministic,
+            )
+            # Generate per-sample segmentation label
+            seg_map_bev = torch.gather(voxel_id, -2, new_idx)
+            # print(seg_map_bev.size())  # torch.Size([N, H, W, n_samples + 1, 1])
+            # In Building Mode, the one more channel is used for building roofs
+            n_classes = (
+                self.cfg.NETWORK.GANCRAFT.N_CLASSES + 1
+                if self.cfg.NETWORK.GANCRAFT.BUILDING_MODE
+                else self.cfg.NETWORK.GANCRAFT.N_CLASSES
+            )
+            # One-hot encode the labels: same leading dims as seg_map_bev, with
+            # n_classes entries in the last dim.
+            seg_map_bev_onehot = torch.zeros(
+                [
+                    seg_map_bev.size(0),
+                    seg_map_bev.size(1),
+                    seg_map_bev.size(2),
+                    seg_map_bev.size(3),
+                    n_classes,
+                ],
+                dtype=torch.float,
+                device=voxel_id.device,
+            )
+            # NOTE: every label must be < n_classes, otherwise scatter_ fails.
+            seg_map_bev_onehot.scatter_(-1, seg_map_bev.long(), 1.0)
+        net_out_s, net_out_c = self._forward_perpix_sub(
+            features, normalized_cord, z, seg_map_bev_onehot
+        )
+        # Blending
+        weights = self._volum_rendering_relu(
+            net_out_s, new_dists * self.cfg.NETWORK.GANCRAFT.DIST_SCALE, dim=-2
+        )
+        # If a ray exclusively hits the sky (no intersection with the voxels), set its weight to zero.
+        weights = weights * torch.logical_not(sky_only_mask).float()
+        # print(weights.size())   # torch.Size([N, H, W, n_samples + 1, 1])
+        # Shift colors from [-1, 1] to [0, 2] before the weighted sum and back
+        # afterwards, so a ray with zero total weight yields -1.
+        rgbs = torch.clamp(net_out_c, -1, 1) + 1
+        net_out = torch.sum(weights * rgbs, dim=-2, keepdim=True)
+        net_out = net_out.squeeze(-2)
+        net_out = net_out - 1
+        return net_out
+    def _get_sampled_coordinates(
+        self,
+        n_samples,
+        depth2,
+        raydirs,
+        cam_origin,
+        building_stats=None,
+        deterministic=False,
+    ):
+        """Sample points along every ray and return them in normalized coordinates.
+
+        Args:
+            n_samples: Number of sample points per ray (``n_samples + 1`` is
+                requested from ``_sample_depth_batched``).
+            depth2: Entrance/exit depths of the ray-voxel intersections.
+            raydirs: Ray directions.
+            cam_origin: Camera origins.
+            building_stats: Offsets of the target building; required in
+                building mode.
+            deterministic: Forwarded to ``_sample_depth_batched``.
+
+        Returns:
+            ``(normalized_cord, new_dists, new_idx)``: the sampled positions
+            mapped to [-1, 1], plus the distances and indices returned by
+            ``_sample_depth_batched``.
+        """
+        # Random sample points along the ray
+        rand_depth, new_dists, new_idx = self._sample_depth_batched(
+            depth2,
+            n_samples + 1,
+            deterministic=deterministic,
+            use_box_boundaries=False,
+            sample_depth=3,
+        )
+        # Replace invalid depths (NaN / inf) by 0, i.e. the camera origin.
+        nan_mask = torch.isnan(rand_depth)
+        inf_mask = torch.isinf(rand_depth)
+        rand_depth[nan_mask | inf_mask] = 0.0
+        world_coord = raydirs * rand_depth + cam_origin[:, None, None, None, :]
+        # assert worldcoord2.shape[-1] == 3
+        if self.cfg.NETWORK.GANCRAFT.BUILDING_MODE:
+            assert building_stats is not None
+            # Make the building object-centric
+            building_stats = building_stats[:, None, None, None, :].repeat(
+                1, world_coord.size(1), world_coord.size(2), world_coord.size(3), 1
+            )
+            world_coord[..., 0] -= (
+                building_stats[..., 0] + self.cfg.NETWORK.GANCRAFT.CENTER_OFFSET
+            )
+            world_coord[..., 1] -= (
+                building_stats[..., 1] + self.cfg.NETWORK.GANCRAFT.CENTER_OFFSET
+            )
+            # TODO: Fix non-building rays
+            # Zero every coordinate component whose ray-direction component is
+            # exactly 0. The boolean mask must match world_coord's shape, so
+            # world_coord presumably has n_samples entries along dim 3 (confirm
+            # against _sample_depth_batched).
+            zero_rd_mask = raydirs.repeat(1, 1, 1, n_samples, 1)
+            world_coord[zero_rd_mask == 0] = 0
+        normalized_cord = self._get_normalized_coordinates(world_coord)
+        return normalized_cord, new_dists, new_idx
+    def _get_normalized_coordinates(self, world_coord):
+        delimeter = torch.tensor(
+            self.cfg.NETWORK.GANCRAFT.NORMALIZE_DELIMETER, device=world_coord.device
+        )
+        normalized_cord = world_coord / delimeter * 2 - 1
+        # TODO: Temporary fix
+        normalized_cord[normalized_cord > 1] = 1
+        normalized_cord[normalized_cord < -1] = -1
+        # assert (normalized_cord <= 1).all()
+        # assert (normalized_cord >= -1).all()
+        # print(delimeter, torch.min(normalized_cord), torch.max(normalized_cord))
+        # print(normalized_cord.size())   # torch.Size([1, 192, 192, 24, 3])
+        return normalized_cord
+    def _sample_depth_batched(
+        self,
+        depth2,
+        n_samples,
+        deterministic=False,
+        use_box_boundaries=True,
+        sample_depth=3,
+    ):
+        r"""Make best effort to sample points within the same distance for every ray.
+        Exception: When there is not enough voxel.
+        Samples are first drawn in "accumulated in-voxel depth" (the distance travelled
+        inside intersected boxes only) and then shifted back to ray depth by adding the
+        entrance depth of the first box plus the empty gaps in front of the box each
+        sample falls into.
+        Args:
+            depth2 (N x H x W x 2 x M x 1 tensor):
+            - N: Batch.
+            - H, W: Height, Width.
+            - 2: Entrance / exit depth for each intersected box.
+            - M: Number of intersected boxes along the ray.
+            - 1: One extra dim for consistent tensor dims.
+            depth2 can include NaNs.
+            n_samples (int): Number of evenly spaced / stratified samples drawn per ray.
+            deterministic (bool): Whether to use equal-distance sampling instead of random stratified sampling.
+            use_box_boundaries (bool): Whether to add the entrance / exit points into the sample.
+            sample_depth (float): Truncate the ray when it travels further than sample_depth inside voxels.
+        Returns:
+            rand_depth: Depth of every returned sample point (midpoints of consecutive sorted samples).
+            new_dists: Distance between the two consecutive sorted samples around each midpoint.
+            idx: For each midpoint, the number of boxes whose accumulated depth lies below it,
+                i.e. the index of the box the midpoint is attributed to.
+        """
+        bs = depth2.size(0)
+        dim0 = depth2.size(1)
+        dim1 = depth2.size(2)
+        # Length travelled inside each box (exit - entrance); NaN entries count as zero length.
+        dists = depth2[:, :, :, 1] - depth2[:, :, :, 0]
+        dists[torch.isnan(dists)] = 0
+        # dists: [N, H, W, M, 1]
+        accu_depth = torch.cumsum(dists, dim=-2)
+        # accu_depth: [N, H, W, M, 1], in-voxel depth accumulated up to the exit of each box.
+        total_depth = accu_depth[..., [-1], :]
+        # total_depth: [N, H, W, 1, 1], capped at sample_depth below.
+        total_depth = torch.clamp(total_depth, None, sample_depth)
+        # Ignore out of range box boundaries. Fill with random samples.
+        if use_box_boundaries:
+            boundary_samples = accu_depth.clone().detach()
+            boundary_samples_filler = torch.rand_like(boundary_samples) * total_depth
+            # A boundary is unusable when it lies beyond the truncation depth or belongs
+            # to a zero-length box (which includes the NaN entries zeroed above).
+            bad_mask = (accu_depth > sample_depth) | (dists == 0)
+            boundary_samples[bad_mask] = boundary_samples_filler[bad_mask]
+        rand_shape = [bs, dim0, dim1, n_samples, 1]
+        if deterministic:
+            rand_samples = torch.empty(
+                rand_shape, dtype=total_depth.dtype, device=total_depth.device
+            )
+            # Evenly spaced fractions strictly inside (0, 1).
+            # NOTE(review): this linspace is created without a device argument, unlike the
+            # stratified branch below; the boundary fillers above stay random in this mode too.
+            rand_samples[..., :, 0] = torch.linspace(0, 1, n_samples + 2)[1:-1]
+        else:
+            rand_samples = torch.rand(
+                rand_shape, dtype=total_depth.dtype, device=total_depth.device
+            )
+            # Stratified sampling as in NeRF
+            rand_samples = rand_samples / n_samples
+            rand_samples[..., :, 0] += torch.linspace(
+                0, 1, n_samples + 1, device=rand_samples.device
+            )[:-1]
+        # Scale the fractions to the (truncated) in-voxel depth of each ray.
+        rand_samples = rand_samples * total_depth
+        # rand_samples: [N, H, W, n_samples, 1]
+        # Can also include boundaries
+        if use_box_boundaries:
+            # Appends the M box boundaries and one zero-depth start point per ray,
+            # giving n_samples + M + 1 entries along dim -2.
+            rand_samples = torch.cat(
+                [
+                    rand_samples,
+                    boundary_samples,
+                    torch.zeros(
+                        [bs, dim0, dim1, 1, 1],
+                        dtype=total_depth.dtype,
+                        device=total_depth.device,
+                    ),
+                ],
+                dim=-2,
+            )
+        rand_samples, _ = torch.sort(rand_samples, dim=-2, descending=False)
+        # Midpoints and lengths of consecutive sample pairs: one entry fewer than rand_samples.
+        midpoints = (rand_samples[..., 1:, :] + rand_samples[..., :-1, :]) / 2
+        new_dists = rand_samples[..., 1:, :] - rand_samples[..., :-1, :]
+        # Scatter the random samples back
+        # midpoints.unsqueeze(-3):  [N, H, W, 1, S, 1] with S the number of midpoints
+        # accu_depth.unsqueeze(-2): [N, H, W, M, 1, 1]
+        # Summing the comparison over the box axis counts the boxes that end before each midpoint.
+        idx = torch.sum(midpoints.unsqueeze(-3) > accu_depth.unsqueeze(-2), dim=-3)
+        # idx: [N, H, W, S, 1]
+        # Empty gap between the exit of box i and the entrance of box i + 1.
+        depth_deltas = (
+            depth2[:, :, :, 0, 1:, :] - depth2[:, :, :, 1, :-1, :]
+        )  # There might be NaNs!
+        # depth_deltas: [N, H, W, M - 1, 1]
+        depth_deltas = torch.cumsum(depth_deltas, dim=-2)
+        # Prepend the entrance depth of the first box so entry k holds that entrance depth
+        # plus all the gaps in front of box k; shape becomes [N, H, W, M, 1].
+        depth_deltas = torch.cat(
+            [depth2[:, :, :, 0, [0], :], depth_deltas + depth2[:, :, :, 0, [0], :]],
+            dim=-2,
+        )
+        # Offset of the box each midpoint was attributed to.
+        heads = torch.gather(depth_deltas, -2, idx)
+        # heads: same shape as idx
+        rand_depth = heads + midpoints
+        # rand_depth: same shape as midpoints
+        return rand_depth, new_dists, idx
+    def _volum_rendering_relu(self, sigma, dists, dim=2):
+        free_energy = F.relu(sigma) * dists
+        a = 1 - torch.exp(-free_energy.float())  # probability of it is not empty here
+        b = torch.exp(
+            -self._cumsum_exclusive(free_energy, dim=dim)
+        )  # probability of everything is empty up to now
+        return a * b  # probability of the ray hits something here
+    def _cumsum_exclusive(self, tensor, dim):
+        cumsum = torch.cumsum(tensor, dim)
+        cumsum = torch.roll(cumsum, 1, dim)
+        cumsum.index_fill_(
+            dim, torch.tensor([0], dtype=torch.long, device=tensor.device), 0
+        )
+        return cumsum
+    def _forward_perpix_sub(self, features, normalized_cord, z, seg_map_bev_onehot):
+        r"""Forwarding the MLP.
+        Builds the per-point input of the render network from the encoder features and
+        the normalized coordinates (optionally passed through the positional encoder),
+        then evaluates the render network on it.
+        Args:
+            features (tensor): Encoder output. Per the shape notes below it is
+                N x ENCODER_OUT_DIM for the "GLOBAL" encoder and
+                N x (ENCODER_OUT_DIM - 1) x H x W for the "LOCAL" encoder.
+            normalized_cord (N x H x W x L x 3 tensor): Normalized 3D coordinates of sampled points. L is number of samples; N is batch size, always 1.
+            z (N x C3 tensor): Intermediate style vectors.
+            seg_map_bev_onehot (N x H x W x L x C4): One-hot segmentation maps.
+        Returns:
+            net_out_s (N x H x W x L x 1 tensor): Opacities.
+            net_out_c (N x H x W x L x C5 tensor): Color embeddings.
+        """
+        # Start from a zero-channel tensor so that an encoder-less configuration
+        # still provides something to concatenate with / pass on.
+        feature_in = torch.empty(
+            normalized_cord.size(0),
+            normalized_cord.size(1),
+            normalized_cord.size(2),
+            normalized_cord.size(3),
+            0,
+            device=normalized_cord.device,
+        )
+        if self.cfg.NETWORK.GANCRAFT.ENCODER == "GLOBAL":
+            # features: [N, ENCODER_OUT_DIM]; the same vector is repeated for every sampled point.
+            feature_in = features[:, None, None, None, :].repeat(
+                1,
+                normalized_cord.size(1),
+                normalized_cord.size(2),
+                normalized_cord.size(3),
+                1,
+            )
+        elif self.cfg.NETWORK.GANCRAFT.ENCODER == "LOCAL":
+            # features:        [N, ENCODER_OUT_DIM - 1, H, W]
+            # normalized_cord: [N, H, W, L, 3]
+            # NOTE: grid specifies the sampling pixel locations normalized by the input spatial
+            # dimensions. Therefore, it should have most values in the range of [-1, 1].
+            # Fold the sample axis into the batch axis so each depth sample is one 2D lookup.
+            grid = normalized_cord.permute(0, 3, 1, 2, 4).reshape(
+                -1, normalized_cord.size(1), normalized_cord.size(2), 3
+            )
+            # grid: [N * L, H, W, 3]
+            # Only the first two coordinates are used for the lookup, in swapped order.
+            # NOTE(review): features is repeated N * L times, which only matches the grid
+            # batch when N == 1 (as stated in the docstring).
+            feature_in = F.grid_sample(
+                features.repeat(grid.size(0), 1, 1, 1),
+                grid[..., [1, 0]],
+                align_corners=False,
+            )
+            # feature_in: [N * L, ENCODER_OUT_DIM - 1, H, W]
+            feature_in = feature_in.reshape(
+                normalized_cord.size(0),
+                normalized_cord.size(3),
+                feature_in.size(1),
+                feature_in.size(2),
+                feature_in.size(3),
+            ).permute(0, 3, 4, 1, 2)
+            # feature_in: [N, H, W, L, ENCODER_OUT_DIM - 1]
+            # The third coordinate is appended as the last feature channel.
+            feature_in = torch.cat([feature_in, normalized_cord[..., [2]]], dim=-1)
+            # feature_in: [N, H, W, L, ENCODER_OUT_DIM]
+        if self.cfg.NETWORK.GANCRAFT.POS_EMD in ["HASH_GRID", "SIN_COS"]:
+            if (
+                self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+                and self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+            ):
+                # Coordinates and features are embedded jointly.
+                feature_in = self.pos_encoder(
+                    torch.cat([normalized_cord, feature_in], dim=-1)
+                )
+            elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS:
+                # Only the coordinates are embedded; the raw features are appended.
+                feature_in = torch.cat(
+                    [self.pos_encoder(normalized_cord), feature_in], dim=-1
+                )
+            elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES:
+                # Ignore normalized_cord here to make it decoupled with coordinates
+                feature_in = torch.cat([self.pos_encoder(feature_in)], dim=-1)
+        else:
+            # No positional embedding: feed raw coordinates and / or raw features.
+            if (
+                self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+                and self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+            ):
+                feature_in = torch.cat([normalized_cord, feature_in], dim=-1)
+            elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS:
+                feature_in = normalized_cord
+            elif self.cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES:
+                # Features are used as they are (the assignment is a no-op).
+                feature_in = feature_in
+        net_out_s, net_out_c = self.render_net(feature_in, z, seg_map_bev_onehot)
+        return net_out_s, net_out_c
+    def _forward_global(self, net_out, z):
+        r"""Forward the CNN
+        Args:
+            net_out (N x C5 x H x W tensor): Intermediate feature maps.
+            z (N x C3 tensor): Intermediate style vectors.
+        Returns:
+            fake_images (N x 3 x H x W tensor): Output image.
+        """
+        fake_images = net_out.permute(0, 3, 1, 2).contiguous()
+        if self.denoiser is not None:
+            fake_images = self.denoiser(fake_images, z)
+            fake_images = torch.tanh(fake_images)
+        return fake_images
+class GlobalEncoder(torch.nn.Module):
+    def __init__(self, cfg):
+        super(GlobalEncoder, self).__init__()
+        n_classes = cfg.NETWORK.GANCRAFT.N_CLASSES
+        self.hf_conv = torch.nn.Conv2d(1, 8, kernel_size=3, stride=2, padding=1)
+        self.seg_conv = torch.nn.Conv2d(
+            n_classes,
+            8,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        )
+        conv_blocks = []
+        cur_hidden_channels = 16
+        for _ in range(1, cfg.NETWORK.GANCRAFT.GLOBAL_ENCODER_N_BLOCKS):
+            conv_blocks.append(
+                SRTConvBlock(in_channels=cur_hidden_channels, out_channels=None)
+            )
+            cur_hidden_channels *= 2
+        self.conv_blocks = torch.nn.Sequential(*conv_blocks)
+        self.fc1 = torch.nn.Linear(cur_hidden_channels, 16)
+        self.fc2 = torch.nn.Linear(16, cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM)
+        self.act = torch.nn.LeakyReLU(0.2)
+    def forward(self, hf_seg):
+        hf = self.act(self.hf_conv(hf_seg[:, [0]]))
+        seg = self.act(self.seg_conv(hf_seg[:, 1:]))
+        out = torch.cat([hf, seg], dim=1)
+        for layer in self.conv_blocks:
+            out = self.act(layer(out))
+        out = out.permute(0, 2, 3, 1)
+        out = torch.mean(out.reshape(out.shape[0], -1, out.shape[-1]), dim=1)
+        cond = self.act(self.fc1(out))
+        cond = torch.tanh(self.fc2(cond))
+        return cond
+class LocalEncoder(torch.nn.Module):
+    def __init__(self, cfg):
+        super(LocalEncoder, self).__init__()
+        n_classes = cfg.NETWORK.GANCRAFT.N_CLASSES
+        self.hf_conv = torch.nn.Conv2d(1, 32, kernel_size=7, stride=2, padding=3)
+        self.seg_conv = torch.nn.Conv2d(
+            n_classes, 32, kernel_size=7, stride=2, padding=3
+        )
+        if cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM == "BATCH_NORM":
+            self.bn1 = torch.nn.BatchNorm2d(64)
+        elif cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM == "GROUP_NORM":
+            self.bn1 = torch.nn.GroupNorm(32, 64)
+        else:
+            raise ValueError(
+                "Unknown normalization: %s" % cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM
+            )
+        self.conv2 = ResConvBlock(64, 128, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM)
+        self.conv3 = ResConvBlock(128, 256, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM)
+        self.conv4 = ResConvBlock(256, 512, cfg.NETWORK.GANCRAFT.LOCAL_ENCODER_NORM)
+        self.dconv5 = torch.nn.ConvTranspose2d(
+            512, 128, kernel_size=4, stride=2, padding=1
+        )
+        self.dconv6 = torch.nn.ConvTranspose2d(
+            128, 32, kernel_size=4, stride=2, padding=1
+        )
+        self.dconv7 = torch.nn.Conv2d(
+            32, cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM - 1, kernel_size=1
+        )
+    def forward(self, hf_seg):
+        hf = self.hf_conv(hf_seg[:, [0]])
+        seg = self.seg_conv(hf_seg[:, 1:])
+        out = F.relu(self.bn1(torch.cat([hf, seg], dim=1)), inplace=True)
+        # print(out.size())   # torch.Size([N, 64, H/2, W/2])
+        out = F.avg_pool2d(self.conv2(out), 2, stride=2)
+        # print(out.size())   # torch.Size([N, 128, H/4, W/4])
+        out = self.conv3(out)
+        # print(out.size())   # torch.Size([N, 256, H/4, W/4])
+        out = self.conv4(out)
+        # print(out.size())   # torch.Size([N, 512, H/4, W/4])
+        out = self.dconv5(out)
+        # print(out.size())   # torch.Size([N, 128, H/2, W/2])
+        out = self.dconv6(out)
+        # print(out.size())   # torch.Size([N, 32, H, W])
+        out = self.dconv7(out)
+        # print(out.size())   # torch.Size([N, OUT_DIM - 1, H, W])
+        return torch.tanh(out)
+class SinCosEncoder(torch.nn.Module):
+    def __init__(self, cfg):
+        super(SinCosEncoder, self).__init__()
+        self.freq_bands = 2.0 ** torch.linspace(
+            0,
+            cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS - 1,
+            steps=cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS,
+        )
+    def forward(self, features):
+        cord_sin = torch.cat(
+            [torch.sin(features * fb) for fb in self.freq_bands], dim=-1
+        )
+        cord_cos = torch.cat(
+            [torch.cos(features * fb) for fb in self.freq_bands], dim=-1
+        )
+        return torch.cat([cord_sin, cord_cos], dim=-1)
+class RenderMLP(torch.nn.Module):
+    r"""MLP with affine modulation."""
+    def __init__(self, cfg):
+        super(RenderMLP, self).__init__()
+        in_dim = 0
+        f_dim = (
+            cfg.NETWORK.GANCRAFT.ENCODER_OUT_DIM
+            if cfg.NETWORK.GANCRAFT.ENCODER in ["GLOBAL", "LOCAL"]
+            else 0
+        )
+        if cfg.NETWORK.GANCRAFT.POS_EMD == "HASH_GRID":
+            in_dim = (
+                cfg.NETWORK.GANCRAFT.HASH_GRID_N_LEVELS
+                * cfg.NETWORK.GANCRAFT.HASH_GRID_LEVEL_DIM
+            )
+            in_dim += (
+                f_dim
+                if cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+                and not cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+                else 0
+            )
+        elif cfg.NETWORK.GANCRAFT.POS_EMD == "SIN_COS":
+            if (
+                cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+                and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+            ):
+                in_dim = (3 + f_dim) * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2
+            elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS:
+                in_dim = 3 * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2 + f_dim
+            elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES:
+                in_dim = f_dim * cfg.NETWORK.GANCRAFT.SIN_COS_FREQ_BENDS * 2
+        else:
+            if (
+                cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS
+                and cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES
+            ):
+                in_dim = 3 + f_dim
+            elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_CORDS:
+                in_dim = 3
+            elif cfg.NETWORK.GANCRAFT.POS_EMD_INCUDE_FEATURES:
+                in_dim = f_dim
+        self.fc_m_a = torch.nn.Linear(
+            cfg.NETWORK.GANCRAFT.N_CLASSES + 1
+            if cfg.NETWORK.GANCRAFT.BUILDING_MODE
+            else cfg.NETWORK.GANCRAFT.N_CLASSES,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            bias=False,
+        )
+        self.fc_1 = torch.nn.Linear(
+            in_dim,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+        )
+        self.fc_2 = (
+            ModLinear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                bias=False,
+                mod_bias=True,
+                output_mode=True,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        )
+        self.fc_3 = (
+            ModLinear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                bias=False,
+                mod_bias=True,
+                output_mode=True,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        )
+        self.fc_4 = (
+            ModLinear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                bias=False,
+                mod_bias=True,
+                output_mode=True,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        )
+        self.fc_sigma = (
+            torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_SIGMA,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_SIGMA,
+            )
+        )
+        self.fc_5 = (
+            ModLinear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                bias=False,
+                mod_bias=True,
+                output_mode=True,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        )
+        self.fc_6 = (
+            ModLinear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                bias=False,
+                mod_bias=True,
+                output_mode=True,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        )
+        self.fc_out_c = (
+            torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR,
+            )
+            if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None
+            else torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+                cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR,
+            )
+        )
+        self.act = torch.nn.LeakyReLU(negative_slope=0.2)
+    def forward(self, x, z, m):
+        r"""Forward network
+        Args:
+            x (N x H x W x M x in_channels tensor): Projected features.
+            z (N x cfg.NETWORK.GANCRAFT.STYLE_DIM tensor): Style codes.
+            m (N x H x W x M x mask_channels tensor): One-hot segmentation maps.
+        """
+        # b, h, w, n, _ = x.size()
+        if z is not None:
+            z = z[:, None, None, None, :]
+        f = self.fc_1(x)
+        f = f + self.fc_m_a(m)
+        # Common MLP
+        f = self.act(f)
+        f = self.act(self.fc_2(f, z)) if z is not None else self.act(self.fc_2(f))
+        f = self.act(self.fc_3(f, z)) if z is not None else self.act(self.fc_3(f))
+        f = self.act(self.fc_4(f, z)) if z is not None else self.act(self.fc_4(f))
+        # Sigma MLP
+        sigma = self.fc_sigma(f) if z is not None else self.act(self.fc_sigma(f))
+        # Color MLP
+        f = self.act(self.fc_5(f, z)) if z is not None else self.act(self.fc_5(f))
+        f = self.act(self.fc_6(f, z)) if z is not None else self.act(self.fc_6(f))
+        c = self.fc_out_c(f)
+        return sigma, c
+class RenderCNN(torch.nn.Module):
+    r"""CNN converting intermediate feature map to final image."""
+    def __init__(self, cfg):
+        super(RenderCNN, self).__init__()
+        if cfg.NETWORK.GANCRAFT.STYLE_DIM is not None:
+            self.fc_z_cond = torch.nn.Linear(
+                cfg.NETWORK.GANCRAFT.STYLE_DIM,
+                2 * 2 * cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            )
+        self.conv1 = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_OUT_DIM_COLOR,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            1,
+            stride=1,
+            padding=0,
+        )
+        self.conv2a = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            3,
+            stride=1,
+            padding=1,
+        )
+        self.conv2b = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            3,
+            stride=1,
+            padding=1,
+            bias=False,
+        )
+        self.conv3a = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            3,
+            stride=1,
+            padding=1,
+        )
+        self.conv3b = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            3,
+            stride=1,
+            padding=1,
+            bias=False,
+        )
+        self.conv4a = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            1,
+            stride=1,
+            padding=0,
+        )
+        self.conv4b = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM,
+            1,
+            stride=1,
+            padding=0,
+        )
+        self.conv4 = torch.nn.Conv2d(
+            cfg.NETWORK.GANCRAFT.RENDER_HIDDEN_DIM, 3, 1, stride=1, padding=0
+        )
+        self.act = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
+    def modulate(self, x, w, b):
+        w = w[..., None, None]
+        b = b[..., None, None]
+        return x * (w + 1) + b
+    def forward(self, x, z):
+        r"""Forward network.
+        Args:
+            x (N x in_channels x H x W tensor): Intermediate feature map
+            z (N x style_dim tensor): Style codes.
+        """
+        if z is not None:
+            z = self.fc_z_cond(z)
+            adapt = torch.chunk(z, 2 * 2, dim=-1)
+        y = self.act(self.conv1(x))
+        y = y + self.conv2b(self.act(self.conv2a(y)))
+        if z is not None:
+            y = self.act(self.modulate(y, adapt[0], adapt[1]))
+        else:
+            y = self.act(y)
+        y = y + self.conv3b(self.act(self.conv3a(y)))
+        if z is not None:
+            y = self.act(self.modulate(y, adapt[2], adapt[3]))
+        else:
+            y = self.act(y)
+        y = y + self.conv4b(self.act(self.conv4a(y)))
+        y = self.act(y)
+        y = self.conv4(y)
+        return y
+class SRTConvBlock(torch.nn.Module):
+    def __init__(self, in_channels, hidden_channels=None, out_channels=None):
+        super(SRTConvBlock, self).__init__()
+        if hidden_channels is None:
+            hidden_channels = in_channels
+        if out_channels is None:
+            out_channels = 2 * hidden_channels
+        self.layers = torch.nn.Sequential(
+            torch.nn.Conv2d(
+                in_channels,
+                hidden_channels,
+                stride=1,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(
+                hidden_channels,
+                out_channels,
+                stride=2,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            torch.nn.ReLU(),
+        )
+    def forward(self, x):
+        return self.layers(x)
+class ResConvBlock(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, norm, bias=False):
+        super(ResConvBlock, self).__init__()
+        # conv3x3(in_planes, int(out_planes / 2))
+        self.conv1 = torch.nn.Conv2d(
+            in_channels,
+            out_channels // 2,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=bias,
+        )
+        # conv3x3(int(out_planes / 2), int(out_planes / 4))
+        self.conv2 = torch.nn.Conv2d(
+            out_channels // 2,
+            out_channels // 4,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=bias,
+        )
+        # conv3x3(int(out_planes / 4), int(out_planes / 4))
+        self.conv3 = torch.nn.Conv2d(
+            out_channels // 4,
+            out_channels // 4,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=bias,
+        )
+        if norm == "BATCH_NORM":
+            self.bn1 = torch.nn.BatchNorm2d(in_channels)
+            self.bn2 = torch.nn.BatchNorm2d(out_channels // 2)
+            self.bn3 = torch.nn.BatchNorm2d(out_channels // 4)
+            self.bn4 = torch.nn.BatchNorm2d(in_channels)
+        elif norm == "GROUP_NORM":
+            self.bn1 = torch.nn.GroupNorm(32, in_channels)
+            self.bn2 = torch.nn.GroupNorm(32, out_channels // 2)
+            self.bn3 = torch.nn.GroupNorm(32, out_channels // 4)
+            self.bn4 = torch.nn.GroupNorm(32, in_channels)
+        if in_channels != out_channels:
+            self.downsample = torch.nn.Sequential(
+                self.bn4,
+                torch.nn.ReLU(True),
+                torch.nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, bias=False
+                ),
+            )
+        else:
+            self.downsample = None
+    def forward(self, x):
+        residual = x
+        # print(residual.size())      # torch.Size([N, 64, H, W])
+        out1 = self.bn1(x)
+        out1 = F.relu(out1, True)
+        out1 = self.conv1(out1)
+        # print(out1.size())          # torch.Size([N, 64, H, W])
+        out2 = self.bn2(out1)
+        out2 = F.relu(out2, True)
+        out2 = self.conv2(out2)
+        # print(out2.size())          # torch.Size([N, 32, H, W])
+        out3 = self.bn3(out2)
+        out3 = F.relu(out3, True)
+        out3 = self.conv3(out3)
+        # print(out3.size())          # torch.Size([N, 32, H, W])
+        out3 = torch.cat((out1, out2, out3), dim=1)
+        # print(out3.size())          # torch.Size([N, 128, H, W])
+        if self.downsample is not None:
+            residual = self.downsample(residual)
+            # print(residual.size())  # torch.Size([N, 128, H, W])
+        out3 += residual
+        return out3
+class ModLinear(torch.nn.Module):
+    r"""Linear layer with affine modulation (Based on StyleGAN2 mod demod).
+    Equivalent to affine modulation following linear, but faster when the same modulation parameters are shared across
+    multiple inputs.
+    Args:
+        in_features (int): Number of input features.
+        out_features (int): Number of output features.
+        style_features (int): Number of style features.
+        bias (bool): Apply additive bias before the activation function?
+        mod_bias (bool): Whether to modulate bias.
+        output_mode (bool): If True, modulate output instead of input.
+        weight_gain (float): Initialization gain
+        bias_init (float): Initial value of the additive bias (only used when bias is True).
+    """
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        style_features,
+        bias=True,
+        mod_bias=True,
+        output_mode=False,
+        weight_gain=1,
+        bias_init=0,
+    ):
+        super(ModLinear, self).__init__()
+        # Scale the random initialization by 1 / sqrt(fan_in).
+        weight_gain = weight_gain / np.sqrt(in_features)
+        self.weight = torch.nn.Parameter(
+            torch.randn([out_features, in_features]) * weight_gain
+        )
+        self.bias = (
+            torch.nn.Parameter(torch.full([out_features], np.float32(bias_init)))
+            if bias
+            else None
+        )
+        # Affine map from the style code to a per-input-feature scale (alpha).
+        self.weight_alpha = torch.nn.Parameter(
+            torch.randn([in_features, style_features]) / np.sqrt(style_features)
+        )
+        self.bias_alpha = torch.nn.Parameter(
+            torch.full([in_features], 1, dtype=torch.float)
+        )  # init to 1
+        # Affine map from the style code to a shift (beta); only created when mod_bias is set.
+        self.weight_beta = None
+        self.bias_beta = None
+        self.mod_bias = mod_bias
+        self.output_mode = output_mode
+        if mod_bias:
+            # The shift is applied to the output in output mode, to the input otherwise.
+            if output_mode:
+                mod_bias_dims = out_features
+            else:
+                mod_bias_dims = in_features
+            self.weight_beta = torch.nn.Parameter(
+                torch.randn([mod_bias_dims, style_features]) / np.sqrt(style_features)
+            )
+            self.bias_beta = torch.nn.Parameter(
+                torch.full([mod_bias_dims], 0, dtype=torch.float)
+            )
+    @staticmethod
+    def _linear_f(x, w, b):
+        # Plain linear map x @ w.T (+ b) over the last axis; all leading axes are
+        # flattened for the matrix product and restored afterwards.
+        w = w.to(x.dtype)
+        x_shape = x.shape
+        x = x.reshape(-1, x_shape[-1])
+        if b is not None:
+            b = b.to(x.dtype)
+            x = torch.addmm(b.unsqueeze(0), x, w.t())
+        else:
+            x = x.matmul(w.t())
+        x = x.reshape(*x_shape[:-1], -1)
+        return x
+    # x: B, ...   , Cin
+    # z: B, 1, 1, , Cz
+    # Returns a tensor shaped like x with the last axis replaced by out_features.
+    def forward(self, x, z):
+        x_shape = x.shape
+        z_shape = z.shape
+        # Collapse everything between the batch and the feature axis.
+        x = x.reshape(x_shape[0], -1, x_shape[-1])
+        z = z.reshape(z_shape[0], 1, z_shape[-1])
+        alpha = self._linear_f(z, self.weight_alpha, self.bias_alpha)  # [B, 1, I]
+        w = self.weight.to(x.dtype)  # [O I]
+        # Per-sample weights: the input columns are scaled by the style-dependent alpha.
+        w = w.unsqueeze(0) * alpha  # [1 O I] * [B 1 I] = [B O I]
+        if self.mod_bias:
+            # beta: [B, 1, O] in output mode, [B, 1, I] otherwise
+            beta = self._linear_f(z, self.weight_beta, self.bias_beta)
+            if not self.output_mode:
+                # Input mode: shift the input before the matrix product.
+                x = x + beta
+        b = self.bias
+        if b is not None:
+            b = b.to(x.dtype)[None, None, :]
+        if self.mod_bias and self.output_mode:
+            # Output mode: the style-dependent shift acts as (part of) the output bias.
+            if b is None:
+                b = beta
+            else:
+                b = b + beta
+        # [B ? I] @ [B I O] = [B ? O]
+        if b is not None:
+            x = torch.baddbmm(b, x, w.transpose(1, 2))
+        else:
+            x = x.bmm(w.transpose(1, 2))
+        # Restore the original leading axes.
+        x = x.reshape(*x_shape[:-1], x.shape[-1])
+        return x
+class GanCraftDiscriminator(torch.nn.Module):
+    def __init__(self, cfg):
+        super(GanCraftDiscriminator, self).__init__()
+        # bottom-up pathway
+        # down_conv2d_block = Conv2dBlock, stride=2, kernel=3, padding=1, weight_norm=spectral
+        # self.enc1 = down_conv2d_block(num_input_channels, num_filters)  # 3
+        self.enc1 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    3,  # RGB
+                    cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=2,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.enc2 = down_conv2d_block(1 * num_filters, 2 * num_filters)  # 7
+        self.enc2 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    1 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=2,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.enc3 = down_conv2d_block(2 * num_filters, 4 * num_filters)  # 15
+        self.enc3 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=2,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.enc4 = down_conv2d_block(4 * num_filters, 8 * num_filters)  # 31
+        self.enc4 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=2,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.enc5 = down_conv2d_block(8 * num_filters, 8 * num_filters)  # 63
+        self.enc5 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=2,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # top-down pathway
+        # latent_conv2d_block = Conv2dBlock, stride=1, kernel=1, weight_norm=spectral
+        # self.lat2 = latent_conv2d_block(2 * num_filters, 4 * num_filters)
+        self.lat2 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=1,
+                    kernel_size=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.lat3 = latent_conv2d_block(4 * num_filters, 4 * num_filters)
+        self.lat3 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=1,
+                    kernel_size=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.lat4 = latent_conv2d_block(8 * num_filters, 4 * num_filters)
+        self.lat4 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=1,
+                    kernel_size=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.lat5 = latent_conv2d_block(8 * num_filters, 4 * num_filters)
+        self.lat5 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    8 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=1,
+                    kernel_size=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # upsampling
+        self.upsample2x = torch.nn.Upsample(
+            scale_factor=2, mode="bilinear", align_corners=False
+        )
+        # final layers
+        # stride1_conv2d_block = Conv2dBlock, stride=1, kernel=3, padding=1, weight_norm=spectral
+        # self.final2 = stride1_conv2d_block(4 * num_filters, 2 * num_filters)
+        self.final2 = torch.nn.Sequential(
+            torch.nn.utils.spectral_norm(
+                torch.nn.Conv2d(
+                    4 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                    stride=1,
+                    kernel_size=3,
+                    padding=1,
+                    bias=True,
+                )
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        # self.output = Conv2dBlock(num_filters * 2, num_labels + 1, kernel_size=1)
+        self.output = torch.nn.Sequential(
+            torch.nn.Conv2d(
+                2 * cfg.NETWORK.GANCRAFT.DIS_N_CHANNEL_BASE,
+                cfg.NETWORK.GANCRAFT.N_CLASSES + 1,
+                stride=1,
+                kernel_size=1,
+                bias=True,
+            ),
+            torch.nn.LeakyReLU(0.2),
+        )
+        self.interpolator = self._smooth_interp
+    @staticmethod
+    def _smooth_interp(x, size):
+        r"""Smooth interpolation of segmentation maps.
+        Args:
+            x (4D tensor): Segmentation maps.
+            size(2D list): Target size (H, W).
+        """
+        x = F.interpolate(x, size=size, mode="area")
+        onehot_idx = torch.argmax(x, dim=-3, keepdims=True)
+        x.fill_(0.0)
+        x.scatter_(1, onehot_idx, 1.0)
+        return x
+    def _single_forward(self, images, seg_maps):
+        # bottom-up pathway
+        feat11 = self.enc1(images)
+        feat12 = self.enc2(feat11)
+        feat13 = self.enc3(feat12)
+        feat14 = self.enc4(feat13)
+        feat15 = self.enc5(feat14)
+        # top-down pathway and lateral connections
+        feat25 = self.lat5(feat15)
+        feat24 = self.upsample2x(feat25) + self.lat4(feat14)
+        feat23 = self.upsample2x(feat24) + self.lat3(feat13)
+        feat22 = self.upsample2x(feat23) + self.lat2(feat12)
+        # final prediction layers
+        feat32 = self.final2(feat22)
+        label_map = self.interpolator(seg_maps, size=feat32.size()[2:])
+        pred = self.output(feat32)  # N, num_labels + 1, H//4, W//4
+        return {"pred": pred, "label": label_map}
+    def forward(self, images, seg_maps, masks):
+        # print(seg_maps.size())  # torch.Size([1, 7, H, W])
+        # print(masks.size())  # torch.Size([1, 1, H, W])
+        seg_maps = seg_maps * masks
+        return self._single_forward(images * masks, seg_maps)

requirements.txt CHANGED Viewed

@@ -2,6 +2,9 @@
 torch==1.12.0
 torchvision
 numpy
 opencv-python
-gradio

 torch==1.12.0
 torchvision
+easydict
+gradio
 numpy
 opencv-python
+pillow