Skip to content

Commit ef69c77

Browse files
committed
Create Atari directory so loading doesn't fail
1 parent 1c419e3 commit ef69c77

File tree

3 files changed

+92
-2
lines changed

3 files changed

+92
-2
lines changed

k8s/devbox.yaml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,11 +37,12 @@ spec:
3737
sudo mkdir -p "/training/cleanba"
3838
sudo chown dev:dev "/training/cleanba"
3939
git clone https://github.com/google-deepmind/boxoban-levels "/opt/sokoban_cache/boxoban-levels-master"
40+
mkdir -p /opt/venv/lib/python3.12/site-packages/envpool/atari/roms # Hack to work around botched image
4041
git pull
4142
git checkout {COMMIT_HASH}
4243
git submodule update --recursive
43-
pip install matplotlib jupyter
44-
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
44+
uv pip install matplotlib jupyter
45+
uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
4546
sleep 1d
4647
resources:
4748
requests:

k8s/runner-no-sokoban.yaml

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,88 @@
# Kubernetes Job template that runs a training command in the
# train-learned-planner image. Unlike the other runner template it does not
# clone the boxoban-levels repository before starting.
# Upper-case names in curly brackets are placeholders; presumably the launcher
# substitutes them before the manifest is parsed -- TODO confirm.
apiVersion: batch/v1
kind: Job
metadata:
  name: {NAME}
  labels:
    kueue.x-k8s.io/queue-name: farai
    wandb-group: {WANDB_RUN_GROUP}
    wandb-project: {WANDB_PROJECT}
    wandb-job-name: {WANDB_JOB_NAME}
    wandb-entity: {WANDB_ENTITY}
    launch-id: {LAUNCH_ID}
spec:
  # Created suspended; presumably Kueue unsuspends the Job on admission
  # (see the queue-name label above) -- verify against the cluster setup.
  suspend: true
  backoffLimit: 10  # How many times to try to run the job until giving up
  template:
    metadata:
      generateName: {NAME}
    spec:
      securityContext:
        runAsUser: 1001
        runAsGroup: 1001
      priorityClassName: {PRIORITY}
      volumes:
        - name: training
          persistentVolumeClaim:
            claimName: vast-learned-planners
        # Memory-backed scratch volume, mounted at /dev/shm below.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: "{SHM_SIZE}"
      containers:
        - name: devbox-container
          image: "ghcr.io/alignmentresearch/train-learned-planner:{CONTAINER_TAG}"
          imagePullPolicy: Always
          # Startup script: prepare the output directory, create the Atari ROM
          # directory the image is missing, move the checkout to the requested
          # commit, then run the substituted command.
          command:
            - bash
            - -c
            - |
              sudo mkdir -p "/training/cleanba"
              sudo chown dev:dev "/training/cleanba"
              mkdir -p /opt/venv/lib/python3.12/site-packages/envpool/atari/roms # Hack to work around botched image
              git pull
              git checkout {COMMIT_HASH}
              git submodule update --recursive
              {COMMAND}
          resources:
            requests:
              cpu: {CPU}
            limits:
              memory: {MEMORY}
              nvidia.com/gpu: {GPU}
          env:
            - name: OMP_NUM_THREADS
              value: {OMP_NUM_THREADS}
            - name: XLA_PYTHON_CLIENT_MEM_FRACTION
              value: {XLA_PYTHON_CLIENT_MEM_FRACTION}
            # Quoted so the value is the string "true" (the name of a program),
            # not a YAML boolean.
            - name: GIT_ASKPASS
              value: "true"
            - name: GITHUB_PAT
              valueFrom:
                secretKeyRef:
                  name: github-credentials
                  key: pat
            # $(GITHUB_PAT) refers to the env entry defined just above, so the
            # order of these two entries matters. The credential key must name
            # the host git actually talks to (github.com); a different host
            # would leave the setting unused for github.com remotes.
            - name: GIT_CONFIG_PARAMETERS
              value: "'credential.https://github.com.username=$(GITHUB_PAT)'"
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: wandb
                  key: api-key
            - name: WANDB_ENTITY
              value: {WANDB_ENTITY}
            - name: WANDB_JOB_NAME
              value: {WANDB_JOB_NAME}
            - name: WANDB_PROJECT
              value: {WANDB_PROJECT}
            - name: WANDB_RUN_GROUP
              value: {WANDB_RUN_GROUP}
            - name: WANDB_MODE
              value: {WANDB_MODE}
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: training
              mountPath: {TRAINING_MOUNT}
      restartPolicy: Never
      imagePullSecrets:
        - name: docker

k8s/runner.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ spec:
4141
sudo mkdir -p "/training/cleanba"
4242
sudo chown dev:dev "/training/cleanba"
4343
git clone https://github.com/google-deepmind/boxoban-levels "/opt/sokoban_cache/boxoban-levels-master"
44+
mkdir -p /opt/venv/lib/python3.12/site-packages/envpool/atari/roms # Hack to work around botched image
4445
git pull
4546
git checkout {COMMIT_HASH}
4647
git submodule update --recursive

0 commit comments

Comments
 (0)