Skip to content

Commit 19dd229

Browse files
committed
More debug
1 parent 981f2d8 commit 19dd229

File tree

1 file changed

+29
-32
lines changed

1 file changed

+29
-32
lines changed

.github/workflows/ci-pytorch-test-tpu.yml

Lines changed: 29 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -78,43 +78,40 @@ jobs:
78 78

79 79
- name: Deploy cluster
80 80
run: |
81-
export PATH=$PATH:$HOME/go/bin
82-
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -)
83-
job_name=${job_name#job.batch/}
84-
job_name=${job_name% created}
85-
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
86-
echo "GKE pod name: $pod_name"
87-
echo "Waiting on kubernetes job: $job_name"
88-
status_code=2 &&
89-
# Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
90-
printf "Waiting for job to finish: "
91-
while true; do
92-
if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
93-
status_code=1 && break;
94-
elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
95-
status_code=0 && break;
96-
else
97-
printf ".";
98-
fi;
99-
sleep 5;
100-
done
101-
echo "Done waiting. Job status code: $status_code"
102-
kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
103-
cat /tmp/full_output.txt
104-
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
105-
csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
106-
else
107-
mv /tmp/full_output.txt xx00;
108-
fi
109-
# First portion is the test logs.
110-
cat xx00 && echo "Done with log retrieval attempt."
111-
exit $status_code
81+
export PATH=$PATH:$HOME/go/bin
82+
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -)
83+
job_name=${job_name#job.batch/}
84+
job_name=${job_name% created}
85+
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
86+
echo "GKE pod name: $pod_name"
87+
echo "Waiting on kubernetes job: $job_name"
88+
status_code=2 &&
89+
# Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
90+
printf "Waiting for job to finish: "
91+
while true; do
92+
if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
93+
status_code=1 && break;
94+
elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
95+
status_code=0 && break;
96+
else
97+
printf ".";
98+
fi;
99+
sleep 5;
100+
done
101+
echo "Done waiting. Job status code: $status_code"
102+
kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
103+
grep '<?xml version="1.0" ?>' /tmp/full_output.txt # sanity check
104+
csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'
105+
# REVERT ME
106+
ls
107+
cat xx01
108+
exit $status_code
112 109
shell: bash
113 110

114 111
- name: Statistics
115 112
if: success()
116 113
run: |
117-
mv ./xx01 coverage.xml
114+
mv xx01 coverage.xml
118 115
pip install coverage -q
119 116
coverage report
120 117
coverage xml

0 commit comments

Comments
 (0)