Spaces:
Build error
Build error
Merged Dockerfile with robust build environment for transformer-engine compilation
Browse files- Dockerfile +35 -31
Dockerfile
CHANGED
|
@@ -1,23 +1,31 @@
|
|
| 1 |
-
#
|
| 2 |
-
FROM nvidia/cuda:12.4.
|
| 3 |
|
| 4 |
# Set environment variables for non-interactive installations to prevent prompts during apt-get.
|
| 5 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
ENV CONDA_DIR=/opt/conda
|
| 7 |
-
ENV PATH=$CONDA_DIR/bin:$PATH
|
| 8 |
|
| 9 |
WORKDIR /app
|
| 10 |
|
| 11 |
-
# Install essential system dependencies
|
| 12 |
-
RUN apt-get update && apt-get install -
|
| 13 |
wget \
|
| 14 |
git \
|
| 15 |
build-essential \
|
| 16 |
libgl1-mesa-glx \
|
| 17 |
libglib2.0-0 \
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
# Install Miniconda
|
| 21 |
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
| 22 |
/bin/bash miniconda.sh -b -p $CONDA_DIR && \
|
| 23 |
rm miniconda.sh && \
|
|
@@ -36,7 +44,7 @@ COPY . /app
|
|
| 36 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
| 37 |
RUN conda env create -f cosmos-predict1.yaml
|
| 38 |
|
| 39 |
-
# Set the default Conda environment to be activated
|
| 40 |
ENV CONDA_DEFAULT_ENV=cosmos-predict1
|
| 41 |
ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
|
| 42 |
|
|
@@ -49,35 +57,31 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
| 49 |
torchaudio==2.3.1 \
|
| 50 |
--index-url https://download.pytorch.org/whl/cu121
|
| 51 |
|
| 52 |
-
#
|
| 53 |
-
#
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
# Install Transformer Engine
|
| 57 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 58 |
conda activate cosmos-predict1 && \
|
| 59 |
-
pip install --no-cache-dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
# Make the start.sh script executable.
|
| 62 |
-
# THIS IS A STANDALONE RUN COMMAND.
|
| 63 |
RUN chmod +x /app/start.sh
|
| 64 |
|
| 65 |
-
# --- Verification Steps ---
|
| 66 |
-
RUN echo "Verifying Python and Conda installations..."
|
| 67 |
-
RUN python --version
|
| 68 |
-
RUN conda env list
|
| 69 |
-
RUN echo "Verifying PyTorch and CUDA availability..."
|
| 70 |
-
RUN conda run -n cosmos-predict1 python <<EOF
|
| 71 |
-
import torch
|
| 72 |
-
print('PyTorch Version: ' + torch.__version__)
|
| 73 |
-
print('CUDA Available: ' + str(torch.cuda.is_available()))
|
| 74 |
-
if torch.cuda.is_available():
|
| 75 |
-
print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
|
| 76 |
-
else:
|
| 77 |
-
print('CUDA Device Name: N/A')
|
| 78 |
-
EOF
|
| 79 |
-
RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
|
| 80 |
-
# --- End Verification Steps ---
|
| 81 |
-
|
| 82 |
# Set the default command to run when the container starts.
|
| 83 |
CMD ["/app/start.sh"]
|
|
|
|
| 1 |
+
# Adopt new base image with cuDNN pre-installed
|
| 2 |
+
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
|
| 3 |
|
| 4 |
# Keep apt-get non-interactive during the image build only.
# ARG (unlike ENV) is visible to the RUN steps below but is not persisted into the running container's environment.
|
| 5 |
ARG DEBIAN_FRONTEND=noninteractive
|
| 6 |
ENV CONDA_DIR=/opt/conda
|
|
|
|
| 7 |
|
| 8 |
WORKDIR /app
|
| 9 |
|
| 10 |
+
# Install essential system dependencies from both Dockerfiles
|
| 11 |
+
RUN apt-get update -y && apt-get install -qqy \
|
| 12 |
wget \
|
| 13 |
git \
|
| 14 |
build-essential \
|
| 15 |
libgl1-mesa-glx \
|
| 16 |
libglib2.0-0 \
|
| 17 |
+
rsync \
|
| 18 |
+
make \
|
| 19 |
+
libssl-dev zlib1g-dev \
|
| 20 |
+
libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
|
| 21 |
+
libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
|
| 22 |
+
ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
|
| 23 |
+
# Ensure git-lfs is installed and initialized
|
| 24 |
+
git-lfs \
|
| 25 |
+
&& rm -rf /var/lib/apt/lists/* \
|
| 26 |
+
&& git lfs install --system # Register the LFS filters in the system git config so every user in the container gets them, not only the build user
|
| 27 |
|
| 28 |
+
# Install Miniconda (retain our existing approach)
|
| 29 |
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
| 30 |
/bin/bash miniconda.sh -b -p $CONDA_DIR && \
|
| 31 |
rm miniconda.sh && \
|
|
|
|
| 44 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
| 45 |
RUN conda env create -f cosmos-predict1.yaml
|
| 46 |
|
| 47 |
+
# Set the default Conda environment to be activated and update PATH
|
| 48 |
ENV CONDA_DEFAULT_ENV=cosmos-predict1
|
| 49 |
ENV PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
|
| 50 |
|
|
|
|
| 57 |
torchaudio==2.3.1 \
|
| 58 |
--index-url https://download.pytorch.org/whl/cu121
|
| 59 |
|
| 60 |
+
# IMPORTANT: Symlink fix for Transformer Engine compilation.
|
| 61 |
+
# The symlink commands below read headers from <prefix>/lib/python3.10/site-packages/nvidia/*/include, which is a Conda env layout, not /usr/local/cuda.
|
| 62 |
+
# Point the prefix at the cosmos-predict1 env so the links resolve and the build can find the cuDNN headers (assumes the env uses Python 3.10 - confirm against cosmos-predict1.yaml).
|
| 63 |
+
ENV CONDA_PREFIX_FIX=$CONDA_DIR/envs/cosmos-predict1
|
| 64 |
+
RUN ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/ || true && \
|
| 65 |
+
ln -sf $CONDA_PREFIX_FIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX_FIX/include/python3.10 || true
|
| 66 |
|
| 67 |
+
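# Install Transformer Engine 1.12.0 inside the cosmos-predict1 env. --no-build-isolation makes pip build against the packages already installed in the env instead of a fresh isolated build environment.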
|
| 68 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 69 |
conda activate cosmos-predict1 && \
|
| 70 |
+
pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
|
| 71 |
+
|
| 72 |
+
# Install Apex for inference.
|
| 73 |
+
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 74 |
+
conda activate cosmos-predict1 && \
|
| 75 |
+
git clone https://github.com/NVIDIA/apex /app/apex && \
|
| 76 |
+
CUDA_HOME=$CONDA_PREFIX pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" /app/apex
|
| 77 |
+
|
| 78 |
+
# Install MoGe for inference.
|
| 79 |
+
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 80 |
+
conda activate cosmos-predict1 && \
|
| 81 |
+
pip install --no-cache-dir git+https://github.com/microsoft/MoGe.git
|
| 82 |
|
| 83 |
# Make the start.sh script executable.
|
|
|
|
| 84 |
RUN chmod +x /app/start.sh
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
# Set the default command to run when the container starts.
|
| 87 |
CMD ["/app/start.sh"]
|