|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Base image: NVIDIA CUDA 11.1.1 "devel" variant on Ubuntu 20.04 (per the tag).
FROM nvidia/cuda:11.1.1-devel-ubuntu20.04

# Keep apt/dpkg from asking interactive questions during package installation.
ENV DEBIAN_FRONTEND=noninteractive

# System packages plus the Python entry points used by the rest of the build.
#  - One package per line, sorted and de-duplicated, so changes diff cleanly.
#  - The apt package lists are deleted in the same layer that downloads them,
#    so they do not add to the image size; run `apt-get update` again before
#    installing further packages on top of this image.
#  - `python` and `pip` are registered as alternatives for python3 / pip3.
#  - pip is upgraded and gpustat installed without keeping pip's download
#    cache in this layer.
RUN apt-get update -y && \
    apt-get install -y \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        g++ \
        gcc \
        git \
        htop \
        ibutils \
        ibverbs-utils \
        iftop \
        infiniband-diags \
        iotop \
        iputils-ping \
        less \
        libcupti-dev \
        libmlx4-1 \
        libpython3-dev \
        llvm-9-dev \
        nano \
        net-tools \
        nfs-common \
        pdsh \
        perftest \
        python3 \
        python3-dev \
        python3-pip \
        rdma-core \
        rdmacm-utils \
        rsync \
        software-properties-common \
        ssh \
        sudo \
        tmux \
        unzip \
        vim \
        wget \
        zstd && \
    rm -rf /var/lib/apt/lists/* && \
    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
    pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir gpustat
|
|
|
|
|
|
|
|
|
|
|
# Password assigned to the root account below.
# NOTE(review): as an ENV value this is visible in the image metadata and in
# every container's environment.
ENV PASSWORD=password

# SSH server preparation:
#  - create the /var/run/sshd directory;
#  - set root's password from $PASSWORD;
#  - rewrite "PermitRootLogin without-password" to "PermitRootLogin yes".
#    NOTE(review): this only has an effect if sshd_config contains that exact
#    text; confirm against the sshd_config shipped in the base image.
#  - switch pam_loginuid from "required" to "optional" for sshd;
#  - append the AuthorizedKeysFile and PasswordAuthentication settings;
#  - append "Set disable_coredump false" to /etc/sudo.conf.
RUN mkdir /var/run/sshd && \
    echo "root:${PASSWORD}" | chpasswd && \
    sed -i -e 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed -i -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
    printf '%s\n' \
        'AuthorizedKeysFile .ssh/authorized_keys' \
        'PasswordAuthentication yes' \
        >> /etc/ssh/sshd_config && \
    printf '%s\n' 'Set disable_coredump false' >> /etc/sudo.conf

# Port the SSH server is expected to listen on (documentation only; it does
# not publish the port).
EXPOSE 22
|
|
|
|
|
|
|
|
# Open MPI release to build. OPENMPI_VERSION is derived from
# OPENMPI_BASEVERSION, so the two stay in separate ENV instructions: a value
# set by an ENV instruction can only be substituted in later instructions.
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.0

# Build Open MPI from the release tarball:
#  - stream the tarball into /build and unpack it there;
#  - configure with a versioned prefix under /usr/local and install;
#  - symlink the versioned prefix to /usr/local/mpi;
#  - fail the build if the mpic++ wrapper is missing after installation;
#  - delete the build tree in the same layer.
RUN mkdir -p /build && \
    wget -q -O - "https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz" \
        | tar -xzf - -C /build && \
    cd "/build/openmpi-${OPENMPI_VERSION}" && \
    ./configure --prefix="/usr/local/openmpi-${OPENMPI_VERSION}" && \
    make -j"$(nproc)" install && \
    ln -s "/usr/local/openmpi-${OPENMPI_VERSION}" /usr/local/mpi && \
    test -f /usr/local/mpi/bin/mpic++ && \
    cd / && \
    rm -rf /build
|
|
|
|
|
|
|
|
# Put the Open MPI binaries first on PATH.
ENV PATH=/usr/local/mpi/bin:${PATH}

# Prepend /usr/local/lib and the Open MPI library directories to whatever
# LD_LIBRARY_PATH was already set to.
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
|
|
|
|
|
|
|
# Replace mpirun with a small bash wrapper that always passes
# --allow-run-as-root and --prefix /usr/local/mpi to the real launcher, which
# is kept next to it as mpirun.real.
# The wrapper uses `exec`, so the real launcher replaces the wrapper shell
# instead of running as its child: signals sent to the wrapper's PID reach
# mpirun.real directly and no extra bash process stays around.
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'exec mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' \
        > /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun
|
|
|
|
|
|
|
|
# Create the unprivileged user "mchorse" (UID 1000, bash login shell, home
# directory created), make it a member of the sudo group, and allow it to run
# any command through sudo without a password.
RUN useradd --create-home --uid 1000 --shell /bin/bash --groups sudo mchorse && \
    echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
|
|
|
|
|
|
|
# Per-user SSH client configuration and shell environment for mchorse:
#  - create ~/.ssh and /job;
#  - write an SSH client config that disables strict host key checking for
#    all hosts;
#  - append to ~/.bashrc: make pdsh use ssh, and put ~/.local/bin and the
#    Open MPI bin/lib directories on PATH / LD_LIBRARY_PATH;
#  - hand ~/.ssh over to mchorse. This RUN executes as root, so without the
#    chown the directory and its config would be root-owned and mchorse could
#    not write files such as known_hosts or authorized_keys there.
# NOTE(review): /job is left root-owned, as before; confirm whether mchorse
# needs to write to it.
RUN mkdir -p /home/mchorse/.ssh /job && \
    printf '%s\n' \
        'Host *' \
        ' StrictHostKeyChecking no' \
        > /home/mchorse/.ssh/config && \
    printf '%s\n' \
        'export PDSH_RCMD_TYPE=ssh' \
        'export PATH=/home/mchorse/.local/bin:$PATH' \
        'export PATH=/usr/local/mpi/bin:$PATH' \
        'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' \
        >> /home/mchorse/.bashrc && \
    chown -R mchorse: /home/mchorse/.ssh
|
|
|
|
|
|
|
|
# PyTorch 1.8.1 built for CUDA 11.1, resolved via the PyTorch wheel index page.
# pip's cache is purged in the same layer so the downloaded wheel is not kept.
RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && \
    pip cache purge

# Copy only the requirements files (in a single layer) so the dependency
# install below stays cached until one of these files changes.
COPY requirements/requirements.txt \
     requirements/requirements-onebitadam.txt \
     requirements/requirements-sparseattention.txt \
     ./

# Install the three requirement sets, then pin protobuf to the 3.20 series.
# The protobuf specifier is quoted so the shell cannot treat the trailing `*`
# as a filename pattern.
RUN pip install -r requirements.txt && \
    pip install -r requirements-onebitadam.txt && \
    pip install -r requirements-sparseattention.txt && \
    pip install 'protobuf==3.20.*' && \
    pip cache purge
|
|
|
|
|
|
|
|
# Build and install NVIDIA Apex from a pinned commit, passing --cpp_ext and
# --cuda_ext through to its setup script.
RUN pip install \
        -v \
        --disable-pip-version-check \
        --no-cache-dir \
        --global-option="--cpp_ext" \
        --global-option="--cuda_ext" \
        git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597
|
|
|
|
|
# Copy the megatron/ directory from the build context into the image, then
# run the install step of its fused_kernels setup script.
COPY megatron/ megatron/
RUN python megatron/fused_kernels/setup.py install
|
|
|
|
|
|
|
|
# Make sure /tmp exists and is writable by every user. Mode 1777 keeps the
# sticky bit, so users can only delete or rename their own files there; a
# plain 0777 would clear that bit.
RUN mkdir -p /tmp && chmod 1777 /tmp

# Everything from here on, including the container's default process, runs as
# the unprivileged user from its home directory.
USER mchorse
WORKDIR /home/mchorse
|
|
|