Fix MAGMA PyTorch issue with GPU (#1154)

rosbo · web-flow · commit 2ac680f0f9cf · 2022-05-10T09:09:27.000-07:00
- Add a GPU test (torch.linalg.solve) to prevent regression.

http://b/231736279
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -12,8 +12,10 @@ ARG TORCHVISION_VERSION
 FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
 FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
 FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
-ENV CUDA_MAJOR_VERSION=11
-ENV CUDA_MINOR_VERSION=0
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
 # NVIDIA binaries from the host are mounted to /opt/bin.
 ENV PATH=/opt/bin:${PATH}
 # Add CUDA stubs to LD_LIBRARY_PATH to support building the GPU image on a CPU machine.
@@ -99,7 +101,8 @@ RUN conda install implicit && \
 # Install PyTorch
 {{ if eq .Accelerator "gpu" }}
 COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
-RUN pip install /tmp/torch/*.whl && \
+RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \
+    pip install /tmp/torch/*.whl && \
     rm -rf /tmp/torch && \
     /tmp/clean-layer.sh
 {{ else }}
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -37,6 +37,8 @@ pipeline {
                 --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
                 --build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \
                 --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
+                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
+                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
                 --push
             '''
           }
diff --git a/config.txt b/config.txt
@@ -7,3 +7,5 @@ TORCH_VERSION=1.9.1
 TORCHAUDIO_VERSION=0.9.1
 TORCHTEXT_VERSION=0.10.1
 TORCHVISION_VERSION=0.10.1
+CUDA_MAJOR_VERSION=11
+CUDA_MINOR_VERSION=0
diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile
@@ -6,12 +6,15 @@ ARG PACKAGE_VERSION
 ARG TORCHAUDIO_VERSION
 ARG TORCHTEXT_VERSION
 ARG TORCHVISION_VERSION
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
 
 # TORCHVISION_VERSION is mandatory
 RUN test -n "$TORCHVISION_VERSION"
 
 # Build instructions: https://github.com/pytorch/pytorch#from-source
 RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools==59.5.0 cmake cffi typing_extensions future six requests dataclasses
+RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}
 
 # By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash.
 # This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000
diff --git a/tests/test_pytorch.py b/tests/test_pytorch.py
@@ -15,6 +15,14 @@ def test_nn(self):
         data_torch = autograd.Variable(torch.randn(2, 5))
         linear_torch(data_torch)
 
+    @gpu_test
+    def test_linalg(self):
+        A = torch.randn(3, 3).t().to('cuda')
+        B = torch.randn(3).t().to('cuda')
+
+        result = torch.linalg.solve(A, B)
+        self.assertEqual(3, result.shape[0])
+
     @gpu_test
     def test_gpu_computation(self):
         cuda = torch.device('cuda')  

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,8 @@ pipeline {`
`37`	`37`	`--build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \`
`38`	`38`	`--build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \`
`39`	`39`	`--build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \`
	`40`	`+ --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \`
	`41`	`+ --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \`
`40`	`42`	`--push`
`41`	`43`	`'''`
`42`	`44`	`}`