@@ -88,10 +88,24 @@ def get_feature_group_as_dataframe(
8888 event_time_feature_name : str = None ,
8989 latest_ingestion : bool = True ,
9090 verbose : bool = True ,
91- ** pandas_read_csv_kwargs ,
91+ ** kwargs ,
9292) -> DataFrame :
9393 """Get a :class:`sagemaker.feature_store.feature_group.FeatureGroup` as a pandas.DataFrame
9494
95+ Examples:
96+ >>> from sagemaker.feature_store.feature_utils import get_feature_group_as_dataframe
97+ >>>
98+ >>> region = "eu-west-1"
99+ >>> fg_data = get_feature_group_as_dataframe(feature_group_name="feature_group",
100+ ... athena_bucket="s3://bucket/athena_queries",
101+ ... region=region,
102+ ... event_time_feature_name="EventTimeId"
103+ ... )
104+ >>>
105+ >>> type(fg_data)
106+ <class 'pandas.core.frame.DataFrame'>
107+ >>>
108+
95109 Description:
96110 Method to run an athena query over a Feature Group in a Feature Store
97111 to retrieve its data. It needs the sagemaker.Session linked to a role
@@ -106,17 +120,22 @@ def get_feature_group_as_dataframe(
106120 in the feature group that wasn't deleted. It needs to use the keyword
107121 "#{table}" to refer to the FeatureGroup name. e.g.:
108122 'SELECT * FROM "sagemaker_featurestore"."#{table}"'
123+ It must not end with ';'.
109124 athena_bucket (str): Amazon S3 bucket for running the query
110- role (str): role of the account used to extract data from feature store
111- session (str): :class:`sagemaker.session.Session`
112- of SageMaker used to work with the feature store
125+ role (str): role to be assumed to extract data from feature store. If not specified
126+ the default sagemaker execution role will be used.
127+ session (str): :class:`sagemaker.session.Session`
128+ of SageMaker used to work with the feature store. Optional, with
129+ role and region parameters it will infer the session.
113130 event_time_feature_name (str): eventTimeId feature. Mandatory only if the
114- latest ingestion is True
131+ latest ingestion is True.
115132 latest_ingestion (bool): if True it will get the data only from the latest ingestion.
116133 If False it will take whatever is specified in the query, or
117134 if not specified, it will get all the data that wasn't deleted.
118135 verbose (bool): if True show messages, if False is silent.
119-
136+ **kwargs (object): keyword arguments used for the method pandas.read_csv to be able to
137+ have finer control over how the data is read. For more info read:
138+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
120139 Returns:
121140 dataset (pandas.DataFrame): dataset with the data retrieved from feature group
122141 """
@@ -139,12 +158,13 @@ def get_feature_group_as_dataframe(
139158 )
140159 logger .exception (exc )
141160 raise exc
161+
142162 query += ";"
143163
144164 if session is not None :
145165 sagemaker_session = session
146- elif role is not None and region is not None :
147- sagemaker_session = get_session_from_role (region = region )
166+ elif region is not None :
167+ sagemaker_session = get_session_from_role (region = region , assume_role = role )
148168 else :
149169 exc = Exception ("Argument Session or role and region must be specified." )
150170 logger .exception (exc )
@@ -166,7 +186,7 @@ def get_feature_group_as_dataframe(
166186 sample_query .wait ()
167187
168188 # run Athena query. The output is loaded to a Pandas dataframe.
169- dataset = sample_query .as_dataframe (** pandas_read_csv_kwargs )
189+ dataset = sample_query .as_dataframe (** kwargs )
170190
171191 msg = f"Data shape retrieve from { feature_group_name } : { dataset .shape } "
172192 logger .info (msg )
@@ -217,7 +237,7 @@ def prepare_fg_from_dataframe_or_file(
217237 record_id : str = "record_id" ,
218238 event_id : str = "data_as_of_date" ,
219239 verbose : bool = False ,
220- ** pandas_read_csv_kwargs ,
240+ ** kwargs ,
221241) -> FeatureGroup :
222242 """Prepares a dataframe to create a :class:`sagemaker.feature_store.feature_group.FeatureGroup`
223243
@@ -229,7 +249,9 @@ def prepare_fg_from_dataframe_or_file(
229249 by default with the names 'record_id' and 'data_as_of_date'.
230250
231251 Args:
232- **pandas_read_csv_kwargs (object):
252+ **kwargs (object): keyword arguments used for the method pandas.read_csv to be able to
253+ have finer control over how the data is read. For more info read:
254+ https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
233255 feature_group_name (str): feature group name
234256 dataframe_or_path (str, Path, pandas.DataFrame) : pandas.DataFrame or path to the data
235257 verbose (bool) : True for displaying messages, False for silent method.
@@ -256,8 +278,8 @@ def prepare_fg_from_dataframe_or_file(
256278 if isinstance (dataframe_or_path , DataFrame ):
257279 data = dataframe_or_path
258280 elif isinstance (dataframe_or_path , str ):
259- pandas_read_csv_kwargs .pop ("filepath_or_buffer" , None )
260- data = read_csv (filepath_or_buffer = dataframe_or_path , ** pandas_read_csv_kwargs )
281+ kwargs .pop ("filepath_or_buffer" , None )
282+ data = read_csv (filepath_or_buffer = dataframe_or_path , ** kwargs )
261283 else :
262284 exc = Exception (
263285 str (
0 commit comments