diff --git a/learntools/ml_explainability/ex3.py b/learntools/ml_explainability/ex3.py index ac76f744a..0aeab0ea1 100644 --- a/learntools/ml_explainability/ex3.py +++ b/learntools/ml_explainability/ex3.py @@ -5,15 +5,14 @@ from learntools.core import * +# 1 class WhyThatUShape(ThoughtExperiment): _solution = \ """ The code is for feat_name in base_features: - pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X, - model_features=base_features, feature=feat_name) - pdp.pdp_plot(pdp_dist, feat_name) + PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name]) plt.show() @@ -25,6 +24,7 @@ class WhyThatUShape(ThoughtExperiment): For the same reason, we see the general U-shape in all our partial dependence plots. """ +# 2 class PonderPDPContour(ThoughtExperiment): _solution = \ """ @@ -40,46 +40,66 @@ class PonderPDPContour(ThoughtExperiment): The code you need to create the desired plot is: - fnames = ['pickup_longitude', 'dropoff_longitude'] - longitudes_partial_plot = pdp.pdp_interact(model=first_model, dataset=val_X, - model_features=base_features, features=fnames) - pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot, - feature_names=fnames, plot_type='contour') + fig, ax = plt.subplots(figsize=(8, 6)) + fnames = [('pickup_longitude', 'dropoff_longitude')] + disp = PartialDependenceDisplay.from_estimator(first_model, val_X, fnames, ax=ax) plt.show() """ +# 3 class ReadPDPContour(CodingProblem): _var = 'savings_from_shorter_trip' - _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the white contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny' - _solution = 'About \$15. The price decreases from slightly more than \$24 to slightly more than \$9.' + _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny' + _solution = 'About 6. The price decreases from slightly less than 15 to slightly less than 9.' def check(self, savings): if type(savings) == str: savings = Decimal(dollars.strip('$')) - assert ((savings > 13) and (savings < 17)), "Your answer should be about 15. Not {}".format(savings) + assert ((savings > 4) and (savings < 8)), "Your answer should be about 6. Not {}".format(savings) +# 4 class MakePDPWithAbsFeatures(CodingProblem): - _var = 'pdp_dist' - _hint = 'use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.' + _var = 'disp' + _hint = 'Use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.' _solution = \ """ -The biggest difference is that the partial dependence plot became much smaller. The the lowest vertical value is about $15 below the highest vertical value in the top chart, whereas this difference is only about $3 in the chart you just created. In other words, once you control for absolute distance traveled, the pickup_longitude has only a very small impact on predictions. +The difference is that the partial dependence plot became smaller. Both plots have a lowest vertical value of 8.5. But, the highest vertical value in the top chart is around 10.7, and the highest vertical value in the bottom chart is below 9.1. In other words, once you control for absolute distance traveled, the pickup_longitude has a smaller impact on predictions. # create new features data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude) data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude) """ - - def check(self, pdp_result): - correct = np.array([9.92212681, 8.97384862, 8.80044327, 8.71024292, 8.71564739, - 8.73523192, 8.76626448, 8.87855912, 9.00098688, 10.99584622]) - submitted = pdp_result.pdp + + def check(self, disp): + correct = np.array([8.730515 , 8.73239078, 8.71804165, 8.72179009, 8.93013488, + 8.68796391, 8.6773792 , 8.6816932 , 8.67547295, 8.64980733, + 8.64402745, 8.65616918, 8.63485345, 8.60505726, 8.59167824, + 8.57101857, 8.55601734, 8.55780041, 8.53660205, 8.53548254, + 8.50739547, 8.50599988, 8.50685068, 8.51981394, 8.52555708, + 8.50483315, 8.53151955, 8.49615781, 8.49384454, 8.49156773, + 8.5123399 , 8.47138576, 8.47491902, 8.50240045, 8.50495725, + 8.50433279, 8.4941558 , 8.50175984, 8.50394946, 8.50890372, + 8.50606589, 8.48335522, 8.48281078, 8.4730394 , 8.47720942, + 8.47699659, 8.52118039, 8.50234077, 8.59717268, 8.51092865, + 8.51177667, 8.51159374, 8.51159432, 8.54379423, 8.50500559, + 8.50631149, 8.52264825, 8.51989952, 8.52841122, 8.52757692, + 8.54425047, 8.56425312, 8.56874055, 8.58372296, 8.5589557 , + 8.57709991, 8.57441775, 8.59449221, 8.60063777, 8.62185164, + 8.6155473 , 8.6118143 , 8.61590988, 8.60758597, 8.62013413, + 8.6334263 , 8.64035478, 8.65324115, 8.66043255, 8.67502176, + 8.68940416, 8.6840402 , 8.67197893, 8.65512484, 8.66810839, + 8.6614093 , 8.65865671, 8.66485738, 8.67966737, 8.82833712, + 9.04135448, 9.03734449, 8.69506545, 8.70261503, 8.70673595, + 8.69045255, 8.69679997, 8.70716659, 8.71006281, 8.71739009]) + submitted = disp.pd_results[0]['average'][0] assert np.allclose(submitted, correct, rtol=0.1) +# 5 class DoesSteepnessImplyImportance(ThoughtExperiment): _solution = "No. This doesn't guarantee `feat_a` is more important. For example, `feat_a` could have a big effect in the cases where it varies, but could have a single value 99\% of the time. In that case, permuting `feat_a` wouldn't matter much, since most values would be unchanged." +# 6 class DesignDatasetUShapedPdp(CodingProblem): - _var = 'pdp_dist' + _var = 'disp' _hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`" _solution = CS( """ @@ -89,22 +109,26 @@ class DesignDatasetUShapedPdp(CodingProblem): # You don't need any more changes """) - def check(self, pdp_result): - segment_1_end = np.argmin(pdp_result.feature_grids<-1) - segment_3_start = np.argmax(pdp_result.feature_grids>1) + def check(self, disp): + pdp_result = disp.pd_results[0] + x_values = pdp_result['values'][0] + y_values = pdp_result['average'][0] + + segment_1_end = np.argmin(x_values<-1) + segment_3_start = np.argmax(x_values>1) segment_2_start = segment_1_end + 1 segment_2_end = segment_3_start - 1 - segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end] - segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end] - segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1] + segment_1_slopes_down = y_values[0] > y_values[segment_1_end] + segment_2_slopes_up = y_values[segment_2_start] < y_values[segment_2_end] + segment_3_slopes_down = y_values[segment_3_start] > y_values[-1] assert segment_1_slopes_down, ("The partial dependence plot does not slope down for values below -1.") assert segment_2_slopes_up, ("The partial dependence plot does not slope up for values between -1 and 1.") assert segment_3_slopes_down, ("The partial dependence plot does not slope down for values above 1.") class DesignFlatPDPWithHighImportance(CodingProblem): - _vars = ['perm', 'pdp_dist'] + _vars = ['perm', 'disp'] _hint = "You need for X1 to affect the prediction in order to have it affect permutation importance. But the average effect needs to be 0 to satisfy the PDP requirement. Achieve this by creating an interaction, so the effect of X1 depends on the value of X2 and vice-versa." _solution = CS( """ @@ -117,9 +141,10 @@ class DesignFlatPDPWithHighImportance(CodingProblem): # Aside from these lines, use the code provided """) - def check(self, importance, pdpResult): + def check(self, importance, disp): X1_imp = importance.feature_importances_[0] - pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp) + pdpResult = disp.pd_results[0]['average'][0] + pdpRange = max(pdpResult) - min(pdpResult) assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. " "Actual importance was {}").format(X1_imp) assert (pdpRange < 0.5), ("Tested that the highest point on the Partial " diff --git a/notebooks/ml_explainability/raw/ex3_partial_plots.ipynb b/notebooks/ml_explainability/raw/ex3_partial_plots.ipynb index 2db42d4b1..a599cfe29 100644 --- a/notebooks/ml_explainability/raw/ex3_partial_plots.ipynb +++ b/notebooks/ml_explainability/raw/ex3_partial_plots.ipynb @@ -17,6 +17,8 @@ "metadata": {}, "outputs": [], "source": [ + "# Get most recent checking code\n", + "!pip install -U -t /kaggle/working/ git+https://github.com/Kaggle/learntools.git\n", "import pandas as pd\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.linear_model import LinearRegression\n", @@ -70,7 +72,7 @@ "source": [ "## Question 1\n", "\n", - "Here is the code to plot the partial dependence plot for pickup_longitude. Run the following cell." + "Here is the code to plot the partial dependence plot for `pickup_longitude`. Run the following cell without changes." ] }, { @@ -80,12 +82,10 @@ "outputs": [], "source": [ "from matplotlib import pyplot as plt\n", - "from pdpbox import pdp, get_dataset, info_plots\n", + "from sklearn.inspection import PartialDependenceDisplay\n", "\n", "feat_name = 'pickup_longitude'\n", - "pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n", - "\n", - "pdp.pdp_plot(pdp_dist, feat_name)\n", + "PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n", "plt.show()" ] }, @@ -107,7 +107,6 @@ "outputs": [], "source": [ "for feat_name in base_features:\n", - " pdp_dist = ____\n", " ____\n", " plt.show()" ] @@ -140,13 +139,13 @@ "Now you will run a 2D partial dependence plot. As a reminder, here is the code from the tutorial. \n", "\n", "```\n", - "inter1 = pdp.pdp_interact(model=my_model, dataset=val_X, model_features=feature_names, features=['Goal Scored', 'Distance Covered (Kms)'])\n", - "\n", - "pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['Goal Scored', 'Distance Covered (Kms)'], plot_type='contour')\n", + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n", + "PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n", "plt.show()\n", "```\n", "\n", - "Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`. Plot it appropriately?\n", + "Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`.\n", "\n", "What do you expect it to look like?" ] @@ -157,6 +156,8 @@ "metadata": {}, "outputs": [], "source": [ + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "\n", "# Add your code here\n", "____" ] @@ -183,7 +184,7 @@ "metadata": {}, "source": [ "## Question 3\n", - "Consider a ride starting at longitude -73.92 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead?" + "Consider a ride starting at longitude -73.955 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead." ] }, { @@ -237,13 +238,10 @@ "source": [ "# This is the PDP for pickup_longitude without the absolute difference features. Included here to help compare it to the new PDP you create\n", "feat_name = 'pickup_longitude'\n", - "pdp_dist_original = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n", - "\n", - "pdp.pdp_plot(pdp_dist_original, feat_name)\n", + "PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n", "plt.show()\n", "\n", - "\n", - "\n", + "# Your code here\n", "# create new features\n", "data['abs_lon_change'] = ____\n", "data['abs_lat_change'] = ____\n", @@ -260,9 +258,7 @@ "second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)\n", "\n", "feat_name = 'pickup_longitude'\n", - "pdp_dist = pdp.pdp_isolate(model=second_model, dataset=new_val_X, model_features=features_2, feature=feat_name)\n", - "\n", - "pdp.pdp_plot(pdp_dist, feat_name)\n", + "disp = PartialDependenceDisplay.from_estimator(second_model, new_val_X, [feat_name])\n", "plt.show()\n", "\n", "# Check your answer\n", @@ -273,7 +269,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Uncomment the lines below to see a hint or the solution (including an explanation of the important differences between the plots)." + "Uncomment the line below to see a hint or the solution (including an explanation of the important differences between the plots)." ] }, { @@ -339,19 +335,17 @@ "# Create array holding predictive feature\n", "X1 = 4 * rand(n_samples) - 2\n", "X2 = 4 * rand(n_samples) - 2\n", + "\n", + "# Your code here\n", "# Create y. you should have X1 and X2 in the expression for y\n", "y = np.ones(n_samples)\n", "\n", - "# create dataframe because pdp_isolate expects a dataFrame as an argument\n", + "# create dataframe \n", "my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})\n", "predictors_df = my_df.drop(['y'], axis=1)\n", "\n", "my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n", - "\n", - "pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n", - "\n", - "# visualize your results\n", - "pdp.pdp_plot(pdp_dist, 'X1')\n", + "disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'])\n", "plt.show()\n", "\n", "# Check your answer\n", @@ -410,8 +404,7 @@ "my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n", "\n", "\n", - "pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n", - "pdp.pdp_plot(pdp_dist, 'X1')\n", + "disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'], grid_resolution=300)\n", "plt.show()\n", "\n", "perm = PermutationImportance(my_model).fit(predictors_df, my_df.y)\n", @@ -448,7 +441,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/ml_explainability/raw/tut3_partial_plots.ipynb b/notebooks/ml_explainability/raw/tut3_partial_plots.ipynb index d11747442..2fafca360 100644 --- a/notebooks/ml_explainability/raw/tut3_partial_plots.ipynb +++ b/notebooks/ml_explainability/raw/tut3_partial_plots.ipynb @@ -88,8 +88,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "Here is the code to create the Partial Dependence Plot using the [PDPBox library](https://pdpbox.readthedocs.io/en/latest/)." + "Here is the code to create the Partial Dependence Plot using the scikit-learn library." ] }, { @@ -99,13 +98,10 @@ "outputs": [], "source": [ "from matplotlib import pyplot as plt\n", - "from pdpbox import pdp, get_dataset, info_plots\n", - "\n", - "# Create the data that we will plot\n", - "pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature='Goal Scored')\n", + "from sklearn.inspection import PartialDependenceDisplay\n", "\n", - "# plot it\n", - "pdp.pdp_plot(pdp_goals, 'Goal Scored')\n", + "# Create and plot the data\n", + "disp1 = PartialDependenceDisplay.from_estimator(tree_model, val_X, ['Goal Scored'])\n", "plt.show()" ] }, @@ -113,9 +109,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A few items are worth pointing out as you interpret this plot\n", - "- The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n", - "- A blue shaded area indicates level of confidence\n", + "The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n", "\n", "From this particular graph, we see that scoring a goal substantially increases your chances of winning \"Man of The Match.\" But extra goals beyond that appear to have little impact on predictions.\n", "\n", @@ -129,9 +123,7 @@ "outputs": [], "source": [ "feature_to_plot = 'Distance Covered (Kms)'\n", - "pdp_dist = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n", - "\n", - "pdp.pdp_plot(pdp_dist, feature_to_plot)\n", + "disp2 = PartialDependenceDisplay.from_estimator(tree_model, val_X, [feature_to_plot])\n", "plt.show()" ] }, @@ -153,9 +145,7 @@ "# Build Random Forest model\n", "rf_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)\n", "\n", - "pdp_dist = pdp.pdp_isolate(model=rf_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n", - "\n", - "pdp.pdp_plot(pdp_dist, feature_to_plot)\n", + "disp3 = PartialDependenceDisplay.from_estimator(rf_model, val_X, [feature_to_plot])\n", "plt.show()" ] }, @@ -179,11 +169,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Similar to previous PDP plot except we use pdp_interact instead of pdp_isolate and pdp_interact_plot instead of pdp_isolate_plot\n", - "features_to_plot = ['Goal Scored', 'Distance Covered (Kms)']\n", - "inter1 = pdp.pdp_interact(model=tree_model, dataset=val_X, model_features=feature_names, features=features_to_plot)\n", - "\n", - "pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')\n", + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n", + "# Similar to previous PDP plot except we use tuple of features instead of single feature\n", + "disp4 = PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n", "plt.show()" ] }, @@ -209,7 +198,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/notebooks/ml_explainability/track_meta.py b/notebooks/ml_explainability/track_meta.py index 1a806b81b..9e31243ef 100644 --- a/notebooks/ml_explainability/track_meta.py +++ b/notebooks/ml_explainability/track_meta.py @@ -41,7 +41,8 @@ filename='ex3_partial_plots.ipynb', lesson_idx=2, type='exercise', - scriptid=1637380 + scriptid=1637380, + enable_internet=True ), dict( filename='tut4_shap_basic.ipynb', diff --git a/notebooks/test.sh b/notebooks/test.sh index 754b50b8e..3a020601b 100755 --- a/notebooks/test.sh +++ b/notebooks/test.sh @@ -11,8 +11,8 @@ if [[ -r /etc/git_commit ]]; then fi # Filter by tracks if first argument set. -TRACKS="intro_to_programming time_series ethics feature_engineering_new computer_vision deep_learning_intro pandas python machine_learning sql data_viz_to_coder ml_intermediate sql_advanced feature_engineering geospatial nlp game_ai data_cleaning" -TESTABLE_NOTEBOOK_TRACKS="intro_to_programming geospatial time_series ethics feature_engineering_new data_viz_to_coder ml_intermediate data_cleaning computer_vision deep_learning_intro python pandas machine_learning game_ai" +TRACKS="ml_explainability intro_to_programming time_series ethics feature_engineering_new computer_vision deep_learning_intro pandas python machine_learning sql data_viz_to_coder ml_intermediate sql_advanced feature_engineering geospatial nlp game_ai data_cleaning" +TESTABLE_NOTEBOOK_TRACKS="ml_explainability intro_to_programming geospatial time_series ethics feature_engineering_new data_viz_to_coder ml_intermediate data_cleaning computer_vision deep_learning_intro python pandas machine_learning game_ai" if [[ -n $1 && $1 != "all" ]]; then TRACKS=$1 @@ -100,7 +100,8 @@ do || [[ ( $nb =~ "tut1" && $track == "computer_vision" ) ]] || [[ ( $nb =~ "tut5" && $track == "computer_vision" ) ]] || [[ ( $nb =~ "tut6" && $track == "computer_vision" ) ]] \ || [[ ( $nb =~ "ex1" && $track == "computer_vision" ) ]] || [[ ( $nb =~ "ex5" && $track == "computer_vision" ) ]] || [[ ( $nb =~ "ex6" && $track == "computer_vision" ) ]] \ || [[ ( $nb =~ "ex1" && $track == "computer_vision" ) ]] \ - || [[ ( $nb =~ "ex2" && $track == "nlp" ) ]] #times out + || [[ ( $nb =~ "ex2" && $track == "nlp" ) ]] \ + || [[ ( $nb =~ "ex3_partial_plots" && $track == "ml_explainability" ) ]] then echo "Warning: skipping $nb in track $track" continue