@@ -291,6 +291,16 @@ jobs:
291291 bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
292292 echo "::endgroup::"
293293
294+ echo "::group::Run inference with quantize file"
295+ for DEVICE in cpu; do # cuda
296+ # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
297+ # follow up with torchao as a separate PR
298+ echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
299+ python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
300+ python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
301+ done
302+ echo "::endgroup::"
303+
294304 test-gpu-aoti-float32 :
295305 permissions :
296306 id-token : write
@@ -335,6 +345,11 @@ jobs:
335345 fi
336346 echo "::endgroup::"
337347
348+ # echo "::group::Run inference with quantize file"
349+ # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
350+ # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
351+ # echo "::endgroup::"
352+
338353 test-gpu-aoti-float16 :
339354 permissions :
340355 id-token : write
@@ -376,10 +391,15 @@ jobs:
376391 echo "::group::Run inference with quantize file"
377392 if [ $(uname -s) == Darwin ]; then
378393 python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
379- python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
394+	          python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
380395 fi
381396 echo "::endgroup::"
382397
398+ # echo "::group::Run inference with quantize file"
399+ # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
400+ # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
401+ # echo "::endgroup::"
402+
383403 test-gpu-eval-sanity-check :
384404 permissions :
385405 id-token : write
@@ -495,12 +515,12 @@ jobs:
495515 python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
496516
497517 echo "******************************************"
498- echo "*** --quantize torchchat/quant_config/mobile.json ***"
518+ echo "*** [TEST DISABLED] Can't test --quantize torchchat/quant_config/mobile.json ***"
519+ echo "*** Testing --quantize torchchat/quant_config/mobile-32.json instead ***"
499520 echo "******************************************"
500- # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
521+	          # python torchchat.py export --quantize torchchat/quant_config/mobile-32.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
501522 # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
502523
503-
504524 echo "******************************************"
505525 echo "******* Emb: channel-wise quantized ******"
506526 echo "******************************************"
@@ -514,16 +534,16 @@ jobs:
514534 python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
515535
516536 echo "******************************************"
517- echo "**** Emb 4bit: channel-wise quantized ****"
537+ echo "**** [TEST DISABLED] Emb 4bit: channel-wise quantized ****"
518538 echo "******************************************"
519- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
520- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
539+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
540+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
521541
522542 echo "******************************************"
523- echo "****** Emb 4bit: group-wise quantized ****"
543+ echo "****** [TEST DISABLED] Emb 4bit: group-wise quantized ****"
524544 echo "******************************************"
525- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
526- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
545+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
546+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
527547
528548 echo "******************************************"
529549 echo "******* INT8 channel-wise quantized ******"
@@ -1055,7 +1075,59 @@ jobs:
10551075 ./runner/build_android.sh
10561076 echo "Tests complete."
10571077
1058- test-torchao-experimental :
1078+ test-torchao-aoti-experimental :
1079+ strategy :
1080+ matrix :
1081+ runner : [macos-14-xlarge]
1082+ runs-on : ${{matrix.runner}}
1083+ steps :
1084+ - name : Checkout repo
1085+ uses : actions/checkout@v3
1086+ with :
1087+ submodules : true
1088+ - name : Setup Python
1089+ uses : actions/setup-python@v2
1090+ with :
1091+ python-version : 3.10.11
1092+ - name : Setup Xcode
1093+ if : runner.os == 'macOS'
1094+ uses : maxim-lobanov/setup-xcode@v1
1095+ with :
1096+	          xcode-version : '15.3'
1097+ - name : Print machine info
1098+ run : |
1099+ uname -a
1100+ if [ $(uname -s) == Darwin ]; then
1101+ sysctl machdep.cpu.brand_string
1102+ sysctl machdep.cpu.core_count
1103+ fi
1104+ - name : Install torchchat
1105+ run : |
1106+ echo "Intalling pip3 packages"
1107+ ./install/install_requirements.sh
1108+ pip3 list
1109+ python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
1110+ - name : Install torchao-ops
1111+ id : install-torchao-ops
1112+ run : |
1113+ bash torchchat/utils/scripts/build_torchao_ops.sh
1114+ - name : Install runner AOTI
1115+ id : install-runner-aoti
1116+ run : |
1117+ bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
1118+ - name : Run inference
1119+ run : |
1120+ python torchchat.py download stories110M
1121+ wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
1122+ export PRMT="Once upon a time in a land far away"
1123+ echo "Export and run AOTI (C++ runner)"
1124+ python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1125+ ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1126+ echo "Generate AOTI"
1127+ python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
1128+ echo "Tests complete."
1129+
1130+ test-torchao-et-experimental :
10591131 strategy :
10601132 matrix :
10611133 runner : [macos-14-xlarge]
@@ -1100,10 +1172,6 @@ jobs:
11001172 run : |
11011173 echo "Installing runner"
11021174 bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
1103- - name : Install runner AOTI
1104- id : install-runner-aoti
1105- run : |
1106- bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
11071175 - name : Run inference
11081176 run : |
11091177 python torchchat.py download stories110M
@@ -1116,11 +1184,6 @@ jobs:
11161184 echo "Export and run ET (C++ runner)"
11171185 python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
11181186 ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
1119- echo "Export and run AOTI (C++ runner)"
1120- python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1121- ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1122- echo "Generate AOTI"
1123- python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
11241187 echo "Tests complete."
11251188
11261189 test-torchao-experimental-mps :