Skip to content

Commit 65229f1

Browse files
committed
feat: add PyTorch development environment configuration using Docker Compose
1 parent 2fd57e2 commit 65229f1

File tree

3 files changed

+40
-10
lines changed

3 files changed

+40
-10
lines changed

docker/pytorch_dev/Dockerfile

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,34 @@ RUN apt-get update && \
1818
RUN mkdir -p /var/run/sshd
1919

2020
# 设置 root 用户密码 (使用你配置的密码)
21-
RUN echo 'root:rt.1qwe2iop' | chpasswd
21+
# RUN echo 'root:rt.1qwe2iop' | chpasswd
2222

2323
# 修改 SSH 配置文件,允许 root 登录和密码认证
24-
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
25-
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
24+
# RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
25+
# sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
26+
27+
ARG YOUR_NAME=allgoo
28+
29+
# 接收 compose 传来的参数,默认值为 1000
30+
ARG HOST_UID=1000
31+
ARG HOST_GID=1000
32+
33+
# 创建一个和宿主机 UID/GID 一致的用户;如果镜像里已有用户或用户组占用了该 UID/GID,先将其删除,然后再创建我们的目标用户组和用户
34+
RUN userdel -r $(getent passwd ${HOST_UID} | cut -d: -f1) 2>/dev/null || true && \
35+
groupdel $(getent group ${HOST_GID} | cut -d: -f1) 2>/dev/null || true && \
36+
groupadd -g ${HOST_GID} ${YOUR_NAME} && \
37+
useradd -u ${HOST_UID} -g ${YOUR_NAME} -m ${YOUR_NAME} -s /bin/bash
38+
39+
# 配置用户 .ssh
40+
RUN mkdir -p /home/${YOUR_NAME}/.ssh && \
41+
chown -R ${HOST_UID}:${HOST_GID} /home/${YOUR_NAME}/.ssh && \
42+
chmod 700 /home/${YOUR_NAME}/.ssh
2643

2744
# 设置工作目录
28-
WORKDIR /workspace
45+
WORKDIR /home/${YOUR_NAME}/workspace
2946

3047
# 暴露 22 端口
3148
EXPOSE 22
3249

3350
# 启动 sshd 服务,并以前台模式 (-D) 运行,这能保持容器一直处于运行状态
34-
CMD ["/usr/sbin/sshd", "-D"]
51+
CMD ["/usr/sbin/sshd", "-D"]

docker/pytorch_dev/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@
44

55
## 构建与启动
66

7+
虽然 bash 终端里有 $UID 这个变量,但它只是 shell 变量,默认不会被导出(export)为环境变量,而 $GID 在 bash 中根本没有定义,因此 docker compose 执行时通常读取不到它们。最稳妥的做法是在 docker-compose.yml 同级目录下建一个 .env 文件,把你的 UID 和 GID 写进去:
8+
9+
在宿主机终端执行(只需执行一次;命令使用 `>>` 追加写入,重复执行会在 .env 中产生重复条目):
10+
11+
```bash
12+
echo "UID=$(id -u)" >> .env
13+
echo "GID=$(id -g)" >> .env
14+
```
15+
716
运行以下命令构建并启动容器:
817

918
```bash
Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
services:
22
pytorch_dev:
33
# 告诉 Compose 根据当前目录(.)的 Dockerfile 来构建镜像
4-
build: .
4+
build:
5+
context: .
6+
args:
7+
- HOST_UID=${UID:-1000}
8+
- HOST_GID=${GID:-1000}
59
# 构建出来的镜像会被自动命名为这个名字
6-
image: pro_pytorch:latest
10+
image: pro_pytorch:latest
711
container_name: pro1
812
# 强烈建议:vLLM 和 PyTorch 多进程需要直接共享宿主机的 IPC 命名空间,防止 NCCL 报错或卡死
913
ipc: host
@@ -13,16 +17,16 @@ services:
1317
- "8000:8000" # 映射 vLLM 默认的服务端口,方便外部调用 API
1418
volumes:
1519
# 将宿主机的代码目录挂载到容器内的 /home/allgoo/workspace
16-
- ~/Codespace:/workspace
20+
- ~/Codespace:/home/allgoo/workspace
1721
# 建议也将本地系统的时间挂载进去,保持时间同步
1822
- /etc/localtime:/etc/localtime:ro
1923
# 挂载本机公钥,便于免密登陆
20-
- ~/.ssh/id_ed25519.pub:/root/.ssh/authorized_keys:ro
24+
- ~/.ssh/id_ed25519.pub:/home/allgoo/.ssh/authorized_keys:ro
2125
deploy:
2226
resources:
2327
reservations:
2428
devices:
2529
- driver: nvidia
2630
count: all
2731
capabilities: [gpu] # 核心配置:透传 GPU 给容器
28-
# shm_size: '16gb' # 增加共享内存,防止 PyTorch DataLoader 报错 (根据你的物理内存可适当调大)
32+
# shm_size: '16gb' # 增加共享内存,防止 PyTorch DataLoader 报错 (根据你的物理内存可适当调大)

0 commit comments

Comments
 (0)