# Kubernetes Job template for a training run.
#
# Tokens written as {UPPER_CASE} are placeholders; presumably a launcher
# substitutes them before the manifest is submitted (the substitution code is
# not part of this file -- confirm against the caller).
apiVersion: batch/v1
kind: Job
metadata:
  name: {NAME}
  labels:
    # Queue this Job is submitted to.
    kueue.x-k8s.io/queue-name: farai
    wandb-group: {WANDB_RUN_GROUP}
    wandb-project: {WANDB_PROJECT}
    wandb-job-name: {WANDB_JOB_NAME}
    wandb-entity: {WANDB_ENTITY}
    launch-id: {LAUNCH_ID}
spec:
  # The Job is created suspended; presumably the queue controller named in the
  # kueue label above un-suspends it on admission -- confirm cluster setup.
  suspend: true
  backoffLimit: 10  # How many times to try to run the job until giving up
  template:
    metadata:
      generateName: {NAME}
    spec:
      securityContext:
        runAsUser: 1001
        runAsGroup: 1001
      priorityClassName: {PRIORITY}
      volumes:
        # Persistent storage, mounted at {TRAINING_MOUNT} in the container.
        - name: training
          persistentVolumeClaim:
            claimName: vast-learned-planners
        # Memory-backed scratch volume, mounted over /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
            sizeLimit: "{SHM_SIZE}"
      containers:
        - name: devbox-container
          image: "ghcr.io/alignmentresearch/train-learned-planner:{CONTAINER_TAG}"
          imagePullPolicy: Always
          command:
            - bash
            - -c
            # Prepare the output directory, move the checkout to the requested
            # commit (including submodules), then run the user command.
            - |
              sudo mkdir -p "/training/cleanba"
              sudo chown dev:dev "/training/cleanba"
              mkdir -p /opt/venv/lib/python3.12/site-packages/envpool/atari/roms # Hack to work around botched image
              git pull
              git checkout {COMMIT_HASH}
              git submodule update --recursive
              {COMMAND}
          resources:
            requests:
              cpu: {CPU}
            limits:
              memory: {MEMORY}
              nvidia.com/gpu: {GPU}
          env:
            # NOTE(review): Kubernetes requires env values to be strings; the
            # unquoted placeholders below must expand to quoted values (or to
            # text YAML parses as a string) -- confirm in the launcher.
            - name: OMP_NUM_THREADS
              value: {OMP_NUM_THREADS}
            - name: XLA_PYTHON_CLIENT_MEM_FRACTION
              value: {XLA_PYTHON_CLIENT_MEM_FRACTION}
            # Use `true` as the askpass helper so git never blocks on an
            # interactive credential prompt.
            - name: GIT_ASKPASS
              value: "true"
            - name: GITHUB_PAT
              valueFrom:
                secretKeyRef:
                  name: github-credentials
                  key: pat
            # Passes the token to git as the username for https://github.com.
            # $(GITHUB_PAT) is expanded by Kubernetes, which is why GITHUB_PAT
            # is declared before this entry.
            - name: GIT_CONFIG_PARAMETERS
              value: "'credential.https://github.com.username=$(GITHUB_PAT)'"
            - name: WANDB_API_KEY
              valueFrom:
                secretKeyRef:
                  name: wandb
                  key: api-key
            - name: WANDB_ENTITY
              value: {WANDB_ENTITY}
            - name: WANDB_JOB_NAME
              value: {WANDB_JOB_NAME}
            - name: WANDB_PROJECT
              value: {WANDB_PROJECT}
            - name: WANDB_RUN_GROUP
              value: {WANDB_RUN_GROUP}
            - name: WANDB_MODE
              value: {WANDB_MODE}
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: training
              mountPath: {TRAINING_MOUNT}
      restartPolicy: Never
      imagePullSecrets:
        - name: docker