@@ -52,6 +52,7 @@ class DJLServingEngineEntryPointDefaults(Enum):
5252 DEEPSPEED = ("DeepSpeed" , "djl_python.deepspeed" )
5353 HUGGINGFACE_ACCELERATE = ("Python" , "djl_python.huggingface" )
5454 STABLE_DIFFUSION = ("DeepSpeed" , "djl_python.stable-diffusion" )
55+ FASTER_TRANSFORMER = ("FasterTransformer" , "djl_python.fastertransformer" )
5556
5657
5758class DJLPredictor (Predictor ):
@@ -93,30 +94,34 @@ def __init__(
9394def _determine_engine_for_model (model_type : str , num_partitions : int , num_heads : int ):
9495 """Placeholder docstring"""
9596
96- # Tensor Parallelism with DeepSpeed is only possible if attention heads can be split evenly
97+ # Tensor Parallelism is only possible if attention heads can be split evenly
9798 # across devices
9899 if num_heads is not None and num_partitions is not None and num_heads % num_partitions :
99100 return HuggingFaceAccelerateModel
100101 if model_type in defaults .DEEPSPEED_RECOMMENDED_ARCHITECTURES :
101102 return DeepSpeedModel
103+ if model_type in defaults .FASTER_TRANSFORMER_RECOMMENDED_ARCHITECTURES :
104+ return FasterTransformerModel
102105 return HuggingFaceAccelerateModel
103106
104107
105108def _validate_engine_for_model_type (cls , model_type : str , num_partitions : int , num_heads : int ):
106109 """Placeholder docstring"""
107110
108111 if cls == DeepSpeedModel :
109- if model_type not in defaults .DEEPSPEED_SUPPORTED_ARCHITECTURES :
110- raise ValueError (
111- f"{ model_type } is not supported by DeepSpeed. "
112- f"Supported model_types are { defaults .DEEPSPEED_SUPPORTED_ARCHITECTURES } "
113- )
114112 if num_heads is not None and num_partitions is not None and num_heads % num_partitions :
115113 raise ValueError (
116114 "The number of attention heads is not evenly divisible by the number of partitions."
117115 "Please set the number of partitions such that the number of attention heads can be"
118116 "evenly split across the partitions."
119117 )
118+ if cls == FasterTransformerModel :
119+ if model_type not in defaults .FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES :
120+ raise ValueError (
121+ f"The model architecture { model_type } is currently not supported by "
124122+ f"FasterTransformer. Please use a different engine, or use the DJLModel "
123+ f"to let SageMaker pick a recommended engine for this model."
124+ )
120125 return cls
121126
122127
@@ -223,6 +228,8 @@ def __new__(
223228 instance .engine = DJLServingEngineEntryPointDefaults .STABLE_DIFFUSION
224229 elif isinstance (instance , DeepSpeedModel ):
225230 instance .engine = DJLServingEngineEntryPointDefaults .DEEPSPEED
231+ elif isinstance (instance , FasterTransformerModel ):
232+ instance .engine = DJLServingEngineEntryPointDefaults .FASTER_TRANSFORMER
226233 else :
227234 instance .engine = DJLServingEngineEntryPointDefaults .HUGGINGFACE_ACCELERATE
228235 return instance
@@ -849,3 +856,62 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
849856 serving_properties ["option.dtype" ] = "auto"
850857 serving_properties .pop ("option.load_in_8bit" , None )
851858 return serving_properties
859+
860+
861+ class FasterTransformerModel (DJLModel ):
862+
863+ """A DJL FasterTransformer SageMaker ``Model`` that can be deployed to a SageMaker ``Endpoint``."""
864+
865+ _framework_name = "djl-fastertransformer"
866+
867+ def __init__ (
868+ self ,
869+ model_id : str ,
870+ role : str ,
871+ tensor_parallel_degree : Optional [int ] = None ,
872+ ** kwargs ,
873+ ):
874+ """Initialize a FasterTransformerModel.
875+
876+ Args:
877+ model_id (str): This is either the HuggingFace Hub model_id, or the Amazon S3 location
878+ containing the uncompressed model artifacts (i.e. not a tar.gz file).
879+ The model artifacts are expected to be in HuggingFace pre-trained model
880+ format (i.e. model should be loadable from the huggingface transformers
881+ from_pretrained api, and should also include tokenizer configs if applicable).
882+ role (str): An AWS IAM role specified with either the name or full ARN. The Amazon
883+ SageMaker training jobs and APIs that create Amazon SageMaker
884+ endpoints use this role to access model artifacts. After the endpoint is created,
885+ the inference code
886+ might use the IAM role, if it needs to access an AWS resource.
887+ tensor_parallel_degree (int): The number of gpus to shard a single instance of the
888+ model across via tensor_parallelism. This should be set to greater than 1 if the
889+ size of the model is larger than the memory available on a single GPU on the
890+ instance. Defaults to None. If not set, no tensor parallel sharding is done.
891+ **kwargs: Keyword arguments passed to the superclasses
892+ :class:`~sagemaker.djl_inference.DJLModel`,
893+ :class:`~sagemaker.model.FrameworkModel`, and
894+ :class:`~sagemaker.model.Model`
895+
896+ .. tip::
897+
898+ You can find additional parameters for initializing this class at
899+ :class:`~sagemaker.djl_inference.DJLModel`,
900+ :class:`~sagemaker.model.FrameworkModel`, and
901+ :class:`~sagemaker.model.Model`.
902+ """
903+
904+ super (FasterTransformerModel , self ).__init__ (
905+ model_id ,
906+ role ,
907+ ** kwargs ,
908+ )
909+ if self .number_of_partitions and tensor_parallel_degree :
910+ logger .warning (
911+ "Both number_of_partitions and tensor_parallel_degree have been set for "
912+ "FasterTransformerModel. "
913+ "These mean the same thing for FasterTransformerModel. Please only set "
914+ "tensor_parallel_degree. "
915+ "number_of_partitions will be ignored"
916+ )
917+ self .number_of_partitions = tensor_parallel_degree or self .number_of_partitions
0 commit comments