diff --git a/doc/frameworks/djl/using_djl.rst b/doc/frameworks/djl/using_djl.rst
index dc77cae405..0582b7ef3e 100644
--- a/doc/frameworks/djl/using_djl.rst
+++ b/doc/frameworks/djl/using_djl.rst
@@ -31,7 +31,7 @@ You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or l
     djl_model = DJLModel(
         "s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
         "my_sagemaker_role",
-        data_type="fp16",
+        dtype="fp16",
         task="text-generation",
         number_of_partitions=2 # number of gpus to partition the model across
     )
@@ -48,7 +48,7 @@ If you want to use a specific backend, then you can create an instance of the co
     deepspeed_model = DeepSpeedModel(
         "s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
         "my_sagemaker_role",
-        data_type="bf16",
+        dtype="bf16",
         task="text-generation",
         tensor_parallel_degree=2, # number of gpus to partition the model across using tensor parallelism
     )
@@ -58,7 +58,7 @@ If you want to use a specific backend, then you can create an instance of the co
     hf_accelerate_model = HuggingFaceAccelerateModel(
         "s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
         "my_sagemaker_role",
-        data_type="fp16",
+        dtype="fp16",
         task="text-generation",
         number_of_partitions=2, # number of gpus to partition the model across
     )
@@ -109,7 +109,7 @@ For example, you can deploy the EleutherAI gpt-j-6B model like this:
     model = DJLModel(
         "EleutherAI/gpt-j-6B",
         "my_sagemaker_role",
-        data_type="fp16",
+        dtype="fp16",
         number_of_partitions=2
     )
 
@@ -142,7 +142,7 @@ You would then pass "s3://my_bucket/gpt-j-6B" as ``model_id`` to the ``DJLModel`
     model = DJLModel(
         "s3://my_bucket/gpt-j-6B",
         "my_sagemaker_role",
-        data_type="fp16",
+        dtype="fp16",
         number_of_partitions=2
     )
 
diff --git a/src/sagemaker/djl_inference/model.py b/src/sagemaker/djl_inference/model.py
index 89f15a54ab..512b54e0c9 100644
--- a/src/sagemaker/djl_inference/model.py
+++ b/src/sagemaker/djl_inference/model.py
@@ -233,7 +233,7 @@ def __init__(
         role: str,
         djl_version: Optional[str] = None,
         task: Optional[str] = None,
-        data_type: str = "fp32",
+        dtype: str = "fp32",
         number_of_partitions: Optional[int] = None,
         min_workers: Optional[int] = None,
         max_workers: Optional[int] = None,
@@ -264,7 +264,7 @@ def __init__(
             task (str): The HuggingFace/NLP task you want to launch this model for.
                 Defaults to None.
                 If not provided, the task will be inferred from the model architecture by DJL.
-            data_type (str): The data type to use for loading your model. Accepted values are
+            dtype (str): The data type to use for loading your model. Accepted values are
                 "fp32", "fp16", "bf16", "int8". Defaults to "fp32".
             number_of_partitions (int): The number of GPUs to partition the model across. The
                 partitioning strategy is determined by the selected backend. If DeepSpeed is
@@ -322,13 +322,20 @@ def __init__(
                 "You only need to set model_id and ensure it points to uncompressed model "
                 "artifacts in s3, or a valid HuggingFace Hub model_id."
             )
+        data_type = kwargs.pop("data_type", None)
+        if data_type:
+            logger.warning(
+                "data_type is being deprecated in favor of dtype. Please migrate use of data_type"
+                " to dtype. Support for data_type will be removed in a future release"
+            )
+            dtype = dtype or data_type
         super(DJLModel, self).__init__(
             None, image_uri, role, entry_point, predictor_cls=predictor_cls, **kwargs
         )
         self.model_id = model_id
         self.djl_version = djl_version
         self.task = task
-        self.data_type = data_type
+        self.dtype = dtype
         self.number_of_partitions = number_of_partitions
         self.min_workers = min_workers
         self.max_workers = max_workers
@@ -372,7 +379,7 @@ def transformer(self, **_):
             "DJLModels do not currently support Batch Transform inference jobs"
         )
 
-    def right_size(self, checkpoint_data_type: str):
+    def right_size(self, **_):
         """Not implemented.
 
         DJLModels do not support SageMaker Inference Recommendation Jobs.
@@ -573,8 +580,8 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
             serving_properties["option.entryPoint"] = self.entry_point
         if self.task:
             serving_properties["option.task"] = self.task
-        if self.data_type:
-            serving_properties["option.dtype"] = self.data_type
+        if self.dtype:
+            serving_properties["option.dtype"] = self.dtype
         if self.min_workers:
             serving_properties["minWorkers"] = self.min_workers
         if self.max_workers:
@@ -779,7 +786,7 @@ def __init__(
                 None.
             load_in_8bit (bool): Whether to load the model in int8 precision using bits and
                 bytes quantization. This is only supported for select model architectures.
-                Defaults to False. If ``data_type`` is int8, then this is set to True.
+                Defaults to False. If ``dtype`` is int8, then this is set to True.
             low_cpu_mem_usage (bool): Whether to limit CPU memory usage to 1x model size during
                 model loading. This is an experimental feature in HuggingFace. This is useful
                 when loading multiple instances of your model in parallel. Defaults to False.
@@ -832,10 +839,10 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
         if self.device_map:
             serving_properties["option.device_map"] = self.device_map
         if self.load_in_8bit:
-            if self.data_type != "int8":
-                raise ValueError("Set data_type='int8' to use load_in_8bit")
+            if self.dtype != "int8":
+                raise ValueError("Set dtype='int8' to use load_in_8bit")
             serving_properties["option.load_in_8bit"] = self.load_in_8bit
-        if self.data_type == "int8":
+        if self.dtype == "int8":
             serving_properties["option.load_in_8bit"] = True
         if self.low_cpu_mem_usage:
             serving_properties["option.low_cpu_mem_usage"] = self.low_cpu_mem_usage
@@ -843,8 +850,8 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
         # TODO: This needs to be fixed when new dlc is published
         if (
             serving_properties["option.entryPoint"] == "djl_python.huggingface"
-            and self.data_type
-            and self.data_type != "auto"
+            and self.dtype
+            and self.dtype != "auto"
         ):
             serving_properties["option.dtype"] = "auto"
             serving_properties.pop("option.load_in_8bit", None)
diff --git a/tests/unit/test_djl_inference.py b/tests/unit/test_djl_inference.py
index c4f03ae502..cb28183afa 100644
--- a/tests/unit/test_djl_inference.py
+++ b/tests/unit/test_djl_inference.py
@@ -351,12 +351,12 @@ def test_generate_huggingface_serving_properties_invalid_configurations(
         VALID_UNCOMPRESSED_MODEL_DATA,
         ROLE,
         sagemaker_session=sagemaker_session,
-        data_type="fp16",
+        dtype="fp16",
         load_in_8bit=True,
     )
     with pytest.raises(ValueError) as invalid_config:
         _ = model.generate_serving_properties()
-    assert str(invalid_config.value).startswith("Set data_type='int8' to use load_in_8bit")
+    assert str(invalid_config.value).startswith("Set dtype='int8' to use load_in_8bit")
 
     model = HuggingFaceAccelerateModel(
         VALID_UNCOMPRESSED_MODEL_DATA,
@@ -391,7 +391,7 @@ def test_generate_serving_properties_with_valid_configurations(
         min_workers=1,
         max_workers=3,
         job_queue_size=4,
-        data_type="fp16",
+        dtype="fp16",
         parallel_loading=True,
         model_loading_timeout=120,
         prediction_timeout=4,
@@ -429,7 +429,7 @@ def test_generate_serving_properties_with_valid_configurations(
         sagemaker_session=sagemaker_session,
         tensor_parallel_degree=1,
         task="text-generation",
-        data_type="bf16",
+        dtype="bf16",
         max_tokens=2048,
         low_cpu_mem_usage=True,
         enable_cuda_graph=True,
@@ -459,7 +459,7 @@ def test_generate_serving_properties_with_valid_configurations(
         number_of_partitions=1,
         device_id=4,
         device_map="balanced",
-        data_type="fp32",
+        dtype="fp32",
         low_cpu_mem_usage=False,
     )
     serving_properties = model.generate_serving_properties()
@@ -513,7 +513,7 @@ def test_deploy_model_no_local_code(
         ROLE,
         sagemaker_session=sagemaker_session,
         number_of_partitions=4,
-        data_type="fp16",
+        dtype="fp16",
         container_log_level=logging.DEBUG,
         env=ENV,
     )
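
For reference, a minimal standalone sketch of the backwards-compatibility behavior the new shim in DJLModel.__init__ introduces; the helper name _resolve_dtype is illustrative only and is not part of the SDK:

    import logging

    logger = logging.getLogger("sagemaker")


    def _resolve_dtype(dtype: str = "fp32", **kwargs) -> str:
        """Mirror the compatibility shim above: pop the legacy kwarg, warn, then fall back."""
        data_type = kwargs.pop("data_type", None)
        if data_type:
            logger.warning(
                "data_type is being deprecated in favor of dtype. Please migrate use of data_type"
                " to dtype. Support for data_type will be removed in a future release"
            )
            # The legacy value only takes effect when dtype is falsy, matching the diff above.
            dtype = dtype or data_type
        return dtype


    # New code should pass dtype directly, e.g. DJLModel(model_id, role, dtype="fp16").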