@@ -717,3 +717,267 @@ def test_validate_precision_type(tmpdir, precision):
 def test_amp_level_raises_error_with_native(tmpdir):
     with pytest.raises(MisconfigurationException, match="not supported with `amp_backend='native'`"):
         _ = Trainer(default_root_dir=tmpdir, gpus=1, amp_level="O2", amp_backend="native", precision=16)
+
+
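+# On CPU, `strategy="ddp_spawn"` with `num_processes=2` should resolve to the CPUAccelerator,
+# the DDPSpawnPlugin, and the default LightningEnvironment.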
+def test_strategy_choice_ddp_spawn_cpu(tmpdir):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2)
+    assert isinstance(trainer.accelerator, CPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
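+# With CUDA mocked as available, `strategy="ddp"` and `strategy="ddp_spawn"` with `gpus=1`
+# should pick the GPUAccelerator and the matching DDP(Spawn)Plugin, still with the default
+# LightningEnvironment.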
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
+@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("torch.cuda.is_available", return_value=True)
+def test_strategy_choice_ddp_spawn(cuda_available_mock, device_count_mock):
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", gpus=1)
+    assert isinstance(trainer.accelerator, GPUAccelerator)
+    assert isinstance(trainer.training_type_plugin, DDPSpawnPlugin)
+    assert isinstance(trainer.training_type_plugin.cluster_environment, LightningEnvironment)
+
+
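+# Under mocked SLURM environment variables the connector should report SLURM-managed tasks and
+# select a SLURMEnvironment; the callback raises SystemExit so fit() stops right after the asserts.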
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_slurm(setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=2)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "SLURM_PROCID": "1",
+        "SLURM_LOCALID": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_slurm(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
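+# TorchElastic-style variables (WORLD_SIZE / RANK / LOCAL_RANK / GROUP_RANK) should resolve to a
+# TorchElasticEnvironment for "ddp", "ddp2", and the CPU ddp_spawn case alike.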
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0,1",
+        "WORLD_SIZE": "2",
+        "LOCAL_WORLD_SIZE": "2",
+        "RANK": "1",
+        "LOCAL_RANK": "1",
+        "GROUP_RANK": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp2_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDP2Plugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp2", gpus=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ, {"WORLD_SIZE": "2", "LOCAL_WORLD_SIZE": "2", "RANK": "1", "LOCAL_RANK": "1", "GROUP_RANK": "0"}
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_te(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 1
+            assert trainer.training_type_plugin.task_idx == 1
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
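+# KUBERNETES_PORT together with MASTER_ADDR / MASTER_PORT / WORLD_SIZE / RANK should be picked up
+# as a KubeflowEnvironment, on GPU as well as CPU.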
+@RunIf(min_gpus=1)
+@mock.patch.dict(
+    os.environ,
+    {
+        "CUDA_VISIBLE_DEVICES": "0",
+        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
+        "MASTER_ADDR": "1.2.3.4",
+        "MASTER_PORT": "500",
+        "WORLD_SIZE": "20",
+        "RANK": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=1)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_kubeflow(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, GPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 0
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp", gpus=1, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
+@mock.patch.dict(
+    os.environ,
+    {
+        "KUBERNETES_PORT": "tcp://127.0.0.1:443",
+        "MASTER_ADDR": "1.2.3.4",
+        "MASTER_PORT": "500",
+        "WORLD_SIZE": "20",
+        "RANK": "1",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_kubeflow(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, KubeflowEnvironment)
+            assert trainer.training_type_plugin.cluster_environment.local_rank() == 0
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
+
+
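+# Same SLURM detection as above, but on CPU: ddp_spawn with SLURM variables should still resolve
+# to DDPPlugin with a SLURMEnvironment.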
+@mock.patch.dict(
+    os.environ,
+    {
+        "SLURM_NTASKS": "2",
+        "SLURM_JOB_NAME": "SOME_NAME",
+        "SLURM_NODEID": "0",
+        "LOCAL_RANK": "0",
+        "SLURM_PROCID": "0",
+        "SLURM_LOCALID": "0",
+    },
+)
+@mock.patch("torch.cuda.device_count", return_value=0)
+@mock.patch("pytorch_lightning.plugins.DDPPlugin.setup_distributed", autospec=True)
+def test_strategy_choice_ddp_cpu_slurm(device_count_mock, setup_distributed_mock):
+    class CB(Callback):
+        def on_fit_start(self, trainer, pl_module):
+            assert trainer.accelerator_connector.is_slurm_managing_tasks
+            assert isinstance(trainer.accelerator, CPUAccelerator)
+            assert isinstance(trainer.training_type_plugin, DDPPlugin)
+            assert isinstance(trainer.training_type_plugin.cluster_environment, SLURMEnvironment)
+            assert trainer.training_type_plugin.task_idx == 0
+            raise SystemExit()
+
+    model = BoringModel()
+    trainer = Trainer(fast_dev_run=True, strategy="ddp_spawn", num_processes=2, callbacks=[CB()])
+
+    with pytest.raises(SystemExit):
+        trainer.fit(model)