@@ -298,9 +298,17 @@ jobs:
298298 python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda-32.json --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
299299 python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
300300
301- fi
301+ fi
302+
303+ for DEVICE in cpu; do # cuda
304+ # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
305+ # follow up with torchao as a separate PR
306+ echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
307+ python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
308+ python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
309+ done
302310 echo "::endgroup::"
303-
311+
304312 test-gpu-aoti-float32 :
305313 permissions :
306314 id-token : write
@@ -349,6 +357,11 @@ jobs:
349357 fi
350358 echo "::endgroup::"
351359
360+ # echo "::group::Run inference with quantize file"
361+ # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
362+ # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
363+ # echo "::endgroup::"
364+
352365 test-gpu-aoti-float16 :
353366 permissions :
354367 id-token : write
@@ -394,6 +407,11 @@ jobs:
394407 fi
395408 echo "::endgroup::"
396409
410+ # echo "::group::Run inference with quantize file"
411+ # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
412+ # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
413+ # echo "::endgroup::"
414+
397415 test-gpu-eval-sanity-check :
398416 permissions :
399417 id-token : write
@@ -509,12 +527,12 @@ jobs:
509527 python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
510528
511529 echo "******************************************"
512- echo "*** --quantize torchchat/quant_config/mobile.json ***"
530+ echo "*** [TEST DISABLED] Can't test --quantize torchchat/quant_config/mobile.json ***"
531+ echo "*** Testing --quantize torchchat/quant_config/mobile-32.json instead ***"
513532 echo "******************************************"
514- # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
533+ # python torchchat.py export --quantize torchchat/quant_config/mobile-32 .json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
515534 # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
516535
517-
518536 echo "******************************************"
519537 echo "******* Emb: channel-wise quantized ******"
520538 echo "******************************************"
@@ -528,16 +546,16 @@ jobs:
528546 python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
529547
530548 echo "******************************************"
531- echo "**** Emb 4bit: channel-wise quantized ****"
549+ echo "**** [TEST DISABLED] Emb 4bit: channel-wise quantized ****"
532550 echo "******************************************"
533- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
534- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
551+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
552+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
535553
536554 echo "******************************************"
537- echo "****** Emb 4bit: group-wise quantized ****"
555+ echo "****** [TEST DISABLED] Emb 4bit: group-wise quantized ****"
538556 echo "******************************************"
539- python torchchat.py export --quant '{"embedding" : {"bitwidth": 8 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
540- python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
557+ # python torchchat.py export --quant '{"embedding" : {"bitwidth": 4 , "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
558+ # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
541559
542560 echo "******************************************"
543561 echo "******* INT8 channel-wise quantized ******"
@@ -1069,7 +1087,59 @@ jobs:
10691087 ./runner/build_android.sh
10701088 echo "Tests complete."
10711089
1072- test-torchao-experimental :
1090+ test-torchao-aoti-experimental :
1091+ strategy :
1092+ matrix :
1093+ runner : [macos-14-xlarge]
1094+ runs-on : ${{matrix.runner}}
1095+ steps :
1096+ - name : Checkout repo
1097+ uses : actions/checkout@v3
1098+ with :
1099+ submodules : true
1100+ - name : Setup Python
1101+ uses : actions/setup-python@v2
1102+ with :
1103+ python-version : 3.10.11
1104+ - name : Setup Xcode
1105+ if : runner.os == 'macOS'
1106+ uses : maxim-lobanov/setup-xcode@v1
1107+ with :
1108+ xcode-version : ' 15.3'
1109+ - name : Print machine info
1110+ run : |
1111+ uname -a
1112+ if [ $(uname -s) == Darwin ]; then
1113+ sysctl machdep.cpu.brand_string
1114+ sysctl machdep.cpu.core_count
1115+ fi
1116+ - name : Install torchchat
1117+ run : |
1118+ echo "Intalling pip3 packages"
1119+ ./install/install_requirements.sh
1120+ pip3 list
1121+ python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
1122+ - name : Install torchao-ops
1123+ id : install-torchao-ops
1124+ run : |
1125+ bash torchchat/utils/scripts/build_torchao_ops.sh
1126+ - name : Install runner AOTI
1127+ id : install-runner-aoti
1128+ run : |
1129+ bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
1130+ - name : Run inference
1131+ run : |
1132+ python torchchat.py download stories110M
1133+ wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
1134+ export PRMT="Once upon a time in a land far away"
1135+ echo "Export and run AOTI (C++ runner)"
1136+ python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1137+ ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1138+ echo "Generate AOTI"
1139+ python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
1140+ echo "Tests complete."
1141+
1142+ test-torchao-et-experimental :
10731143 strategy :
10741144 matrix :
10751145 runner : [macos-14-xlarge]
@@ -1114,10 +1184,6 @@ jobs:
11141184 run : |
11151185 echo "Installing runner"
11161186 bash torchchat/utils/scripts/build_native.sh et link_torchao_ops
1117- - name : Install runner AOTI
1118- id : install-runner-aoti
1119- run : |
1120- bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
11211187 - name : Run inference
11221188 run : |
11231189 python torchchat.py download stories110M
@@ -1130,11 +1196,6 @@ jobs:
11301196 echo "Export and run ET (C++ runner)"
11311197 python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
11321198 ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
1133- echo "Export and run AOTI (C++ runner)"
1134- python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
1135- ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
1136- echo "Generate AOTI"
1137- python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
11381199 echo "Tests complete."
11391200
11401201 test-torchao-experimental-mps :
0 commit comments