Merged
83 changes: 54 additions & 29 deletions learntools/ml_explainability/ex3.py
@@ -5,15 +5,14 @@

from learntools.core import *

# 1
class WhyThatUShape(ThoughtExperiment):
_solution = \
"""
The code is

for feat_name in base_features:
pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X,
model_features=base_features, feature=feat_name)
pdp.pdp_plot(pdp_dist, feat_name)
PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])
plt.show()


@@ -25,6 +24,7 @@ class WhyThatUShape(ThoughtExperiment):
For the same reason, we see the general U-shape in all our partial dependence plots.
"""

# 2
class PonderPDPContour(ThoughtExperiment):
_solution = \
"""
@@ -40,46 +40,66 @@ class PonderPDPContour(ThoughtExperiment):

The code you need to create the desired plot is:

fnames = ['pickup_longitude', 'dropoff_longitude']
longitudes_partial_plot = pdp.pdp_interact(model=first_model, dataset=val_X,
model_features=base_features, features=fnames)
pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot,
feature_names=fnames, plot_type='contour')
fig, ax = plt.subplots(figsize=(8, 6))
fnames = [('pickup_longitude', 'dropoff_longitude')]
disp = PartialDependenceDisplay.from_estimator(first_model, val_X, fnames, ax=ax)
plt.show()
"""

# 3
class ReadPDPContour(CodingProblem):
_var = 'savings_from_shorter_trip'
_hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the white contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
_solution = 'About \$15. The price decreases from slightly more than \$24 to slightly more than \$9.'
_hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
_solution = 'About 6. The price decreases from slightly less than 15 to slightly less than 9.'
def check(self, savings):
if type(savings) == str:
savings = Decimal(savings.strip('$'))
assert ((savings > 13) and (savings < 17)), "Your answer should be about 15. Not {}".format(savings)
assert ((savings > 4) and (savings < 8)), "Your answer should be about 6. Not {}".format(savings)

# 4
class MakePDPWithAbsFeatures(CodingProblem):
_var = 'pdp_dist'
_hint = 'use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
_var = 'disp'
_hint = 'Use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
_solution = \
"""
The biggest difference is that the partial dependence plot became much smaller. The lowest vertical value is about $15 below the highest vertical value in the top chart, whereas this difference is only about $3 in the chart you just created. In other words, once you control for absolute distance traveled, the pickup_longitude has only a very small impact on predictions.
The difference is that the partial dependence plot became smaller. Both plots have a lowest vertical value of about 8.5, but the highest vertical value in the top chart is around 10.7, while the highest vertical value in the bottom chart is below 9.1. In other words, once you control for absolute distance traveled, pickup_longitude has a smaller impact on predictions.

# create new features
data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)
"""

def check(self, pdp_result):
correct = np.array([9.92212681, 8.97384862, 8.80044327, 8.71024292, 8.71564739,
8.73523192, 8.76626448, 8.87855912, 9.00098688, 10.99584622])
submitted = pdp_result.pdp

def check(self, disp):
correct = np.array([8.730515 , 8.73239078, 8.71804165, 8.72179009, 8.93013488,
8.68796391, 8.6773792 , 8.6816932 , 8.67547295, 8.64980733,
8.64402745, 8.65616918, 8.63485345, 8.60505726, 8.59167824,
8.57101857, 8.55601734, 8.55780041, 8.53660205, 8.53548254,
8.50739547, 8.50599988, 8.50685068, 8.51981394, 8.52555708,
8.50483315, 8.53151955, 8.49615781, 8.49384454, 8.49156773,
8.5123399 , 8.47138576, 8.47491902, 8.50240045, 8.50495725,
8.50433279, 8.4941558 , 8.50175984, 8.50394946, 8.50890372,
8.50606589, 8.48335522, 8.48281078, 8.4730394 , 8.47720942,
8.47699659, 8.52118039, 8.50234077, 8.59717268, 8.51092865,
8.51177667, 8.51159374, 8.51159432, 8.54379423, 8.50500559,
8.50631149, 8.52264825, 8.51989952, 8.52841122, 8.52757692,
8.54425047, 8.56425312, 8.56874055, 8.58372296, 8.5589557 ,
8.57709991, 8.57441775, 8.59449221, 8.60063777, 8.62185164,
8.6155473 , 8.6118143 , 8.61590988, 8.60758597, 8.62013413,
8.6334263 , 8.64035478, 8.65324115, 8.66043255, 8.67502176,
8.68940416, 8.6840402 , 8.67197893, 8.65512484, 8.66810839,
8.6614093 , 8.65865671, 8.66485738, 8.67966737, 8.82833712,
9.04135448, 9.03734449, 8.69506545, 8.70261503, 8.70673595,
8.69045255, 8.69679997, 8.70716659, 8.71006281, 8.71739009])
submitted = disp.pd_results[0]['average'][0]
assert np.allclose(submitted, correct, rtol=0.1)

# 5
class DoesSteepnessImplyImportance(ThoughtExperiment):
_solution = "No. This doesn't guarantee `feat_a` is more important. For example, `feat_a` could have a big effect in the cases where it varies, but could have a single value 99\% of the time. In that case, permuting `feat_a` wouldn't matter much, since most values would be unchanged."

# 6
class DesignDatasetUShapedPdp(CodingProblem):
_var = 'pdp_dist'
_var = 'disp'
_hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`"
_solution = CS(
"""
@@ -89,22 +109,26 @@ class DesignDatasetUShapedPdp(CodingProblem):
# You don't need any more changes
""")

def check(self, pdp_result):
segment_1_end = np.argmin(pdp_result.feature_grids<-1)
segment_3_start = np.argmax(pdp_result.feature_grids>1)
def check(self, disp):
pdp_result = disp.pd_results[0]
x_values = pdp_result['values'][0]
y_values = pdp_result['average'][0]

segment_1_end = np.argmin(x_values<-1)
segment_3_start = np.argmax(x_values>1)
segment_2_start = segment_1_end + 1
segment_2_end = segment_3_start - 1

segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end]
segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end]
segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1]
segment_1_slopes_down = y_values[0] > y_values[segment_1_end]
segment_2_slopes_up = y_values[segment_2_start] < y_values[segment_2_end]
segment_3_slopes_down = y_values[segment_3_start] > y_values[-1]

assert segment_1_slopes_down, ("The partial dependence plot does not slope down for values below -1.")
assert segment_2_slopes_up, ("The partial dependence plot does not slope up for values between -1 and 1.")
assert segment_3_slopes_down, ("The partial dependence plot does not slope down for values above 1.")

class DesignFlatPDPWithHighImportance(CodingProblem):
_vars = ['perm', 'pdp_dist']
_vars = ['perm', 'disp']
_hint = "You need for X1 to affect the prediction in order to have it affect permutation importance. But the average effect needs to be 0 to satisfy the PDP requirement. Achieve this by creating an interaction, so the effect of X1 depends on the value of X2 and vice-versa."
_solution = CS(
"""
@@ -117,9 +141,10 @@ class DesignFlatPDPWithHighImportance(CodingProblem):
# Aside from these lines, use the code provided
""")

def check(self, importance, pdpResult):
def check(self, importance, disp):
X1_imp = importance.feature_importances_[0]
pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
pdpResult = disp.pd_results[0]['average'][0]
pdpRange = max(pdpResult) - min(pdpResult)
assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. "
"Actual importance was {}").format(X1_imp)
assert (pdpRange < 0.5), ("Tested that the highest point on the Partial "
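Reviewer note: the rewritten `check` methods above no longer read `pdp_result.pdp` from pdpbox; they index into the display object returned by scikit-learn, e.g. `disp.pd_results[0]['average'][0]`. The self-contained sketch below shows that access pattern on a toy model. The data and model here are illustrative assumptions, and the `'values'` key reflects scikit-learn 1.0–1.2 (newer releases expose the same grids under `grid_values`).

```python
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

# Toy data standing in for the taxi features used in the exercise
rng = np.random.RandomState(0)
X = pd.DataFrame({'X1': rng.uniform(-2, 2, 500), 'X2': rng.uniform(-2, 2, 500)})
y = X['X1'] - 2 * X['X2']
model = RandomForestRegressor(n_estimators=30, random_state=1).fit(X, y)

disp = PartialDependenceDisplay.from_estimator(model, X, ['X1'])
plt.show()

result = disp.pd_results[0]      # one result bunch per plotted feature
x_grid = result['values'][0]     # grid of X1 values ('grid_values' in newer sklearn)
y_avg = result['average'][0]     # average prediction at each grid point
print(x_grid.shape, y_avg.shape)
```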
51 changes: 22 additions & 29 deletions notebooks/ml_explainability/raw/ex3_partial_plots.ipynb
@@ -17,6 +17,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Get most recent checking code\n",
"!pip install -U -t /kaggle/working/ git+https://github.com/Kaggle/learntools.git\n",
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
@@ -70,7 +72,7 @@
"source": [
"## Question 1\n",
"\n",
"Here is the code to plot the partial dependence plot for pickup_longitude. Run the following cell."
"Here is the code to plot the partial dependence plot for `pickup_longitude`. Run the following cell without changes."
]
},
{
@@ -80,12 +82,10 @@
"outputs": [],
"source": [
"from matplotlib import pyplot as plt\n",
"from pdpbox import pdp, get_dataset, info_plots\n",
"from sklearn.inspection import PartialDependenceDisplay\n",
"\n",
"feat_name = 'pickup_longitude'\n",
"pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n",
"\n",
"pdp.pdp_plot(pdp_dist, feat_name)\n",
"PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n",
"plt.show()"
]
},
@@ -107,7 +107,6 @@
"outputs": [],
"source": [
"for feat_name in base_features:\n",
" pdp_dist = ____\n",
" ____\n",
" plt.show()"
]
@@ -140,13 +139,13 @@
"Now you will run a 2D partial dependence plot. As a reminder, here is the code from the tutorial. \n",
"\n",
"```\n",
"inter1 = pdp.pdp_interact(model=my_model, dataset=val_X, model_features=feature_names, features=['Goal Scored', 'Distance Covered (Kms)'])\n",
"\n",
"pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['Goal Scored', 'Distance Covered (Kms)'], plot_type='contour')\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n",
"PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n",
"plt.show()\n",
"```\n",
"\n",
"Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`. Plot it appropriately?\n",
"Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`.\n",
"\n",
"What do you expect it to look like?"
]
@@ -157,6 +156,8 @@
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"\n",
"# Add your code here\n",
"____"
]
@@ -183,7 +184,7 @@
"metadata": {},
"source": [
"## Question 3\n",
"Consider a ride starting at longitude -73.92 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead?"
"Consider a ride starting at longitude -73.955 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead."
]
},
{
@@ -237,13 +238,10 @@
"source": [
"# This is the PDP for pickup_longitude without the absolute difference features. Included here to help compare it to the new PDP you create\n",
"feat_name = 'pickup_longitude'\n",
"pdp_dist_original = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n",
"\n",
"pdp.pdp_plot(pdp_dist_original, feat_name)\n",
"PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n",
"plt.show()\n",
"\n",
"\n",
"\n",
"# Your code here\n",
"# create new features\n",
"data['abs_lon_change'] = ____\n",
"data['abs_lat_change'] = ____\n",
@@ -260,9 +258,7 @@
"second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)\n",
"\n",
"feat_name = 'pickup_longitude'\n",
"pdp_dist = pdp.pdp_isolate(model=second_model, dataset=new_val_X, model_features=features_2, feature=feat_name)\n",
"\n",
"pdp.pdp_plot(pdp_dist, feat_name)\n",
"disp = PartialDependenceDisplay.from_estimator(second_model, new_val_X, [feat_name])\n",
"plt.show()\n",
"\n",
"# Check your answer\n",
@@ -273,7 +269,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Uncomment the lines below to see a hint or the solution (including an explanation of the important differences between the plots)."
"Uncomment the line below to see a hint or the solution (including an explanation of the important differences between the plots)."
]
},
{
@@ -339,19 +335,17 @@
"# Create array holding predictive feature\n",
"X1 = 4 * rand(n_samples) - 2\n",
"X2 = 4 * rand(n_samples) - 2\n",
"\n",
"# Your code here\n",
"# Create y. you should have X1 and X2 in the expression for y\n",
"y = np.ones(n_samples)\n",
"\n",
"# create dataframe because pdp_isolate expects a dataFrame as an argument\n",
"# create dataframe \n",
"my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})\n",
"predictors_df = my_df.drop(['y'], axis=1)\n",
"\n",
"my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n",
"\n",
"pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n",
"\n",
"# visualize your results\n",
"pdp.pdp_plot(pdp_dist, 'X1')\n",
"disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'])\n",
"plt.show()\n",
"\n",
"# Check your answer\n",
@@ -410,8 +404,7 @@
"my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n",
"\n",
"\n",
"pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n",
"pdp.pdp_plot(pdp_dist, 'X1')\n",
"disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'], grid_resolution=300)\n",
"plt.show()\n",
"\n",
"perm = PermutationImportance(my_model).fit(predictors_df, my_df.y)\n",
@@ -448,7 +441,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
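For Question 6 above (the `disp` checked by `DesignDatasetUShapedPdp`), one target definition that satisfies the down/up/down slope checks and follows the hint about terms like `(X1 < -1)` is sketched below. This is an illustrative construction, not necessarily the official solution, and `n_samples` is chosen here only for the sketch.

```python
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from numpy.random import rand
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

n_samples = 20000  # assumed sample size for this sketch

# Create arrays holding the predictive features, as in the exercise cell
X1 = 4 * rand(n_samples) - 2
X2 = 4 * rand(n_samples) - 2

# Piecewise target: behaves like -X1 below -1, X1 between -1 and 1, -X1 above 1,
# so the partial dependence on X1 slopes down, then up, then down again
y = -2 * X1 * (X1 < -1) + X1 - 2 * X1 * (X1 > 1)

my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})
predictors_df = my_df.drop(['y'], axis=1)
my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)

disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'])
plt.show()
```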
35 changes: 12 additions & 23 deletions notebooks/ml_explainability/raw/tut3_partial_plots.ipynb
@@ -88,8 +88,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Here is the code to create the Partial Dependence Plot using the [PDPBox library](https://pdpbox.readthedocs.io/en/latest/)."
"Here is the code to create the Partial Dependence Plot using the scikit-learn library."
]
},
{
@@ -99,23 +98,18 @@
"outputs": [],
"source": [
"from matplotlib import pyplot as plt\n",
"from pdpbox import pdp, get_dataset, info_plots\n",
"\n",
"# Create the data that we will plot\n",
"pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature='Goal Scored')\n",
"from sklearn.inspection import PartialDependenceDisplay\n",
"\n",
"# plot it\n",
"pdp.pdp_plot(pdp_goals, 'Goal Scored')\n",
"# Create and plot the data\n",
"disp1 = PartialDependenceDisplay.from_estimator(tree_model, val_X, ['Goal Scored'])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A few items are worth pointing out as you interpret this plot\n",
"- The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n",
"- A blue shaded area indicates level of confidence\n",
"The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n",
"\n",
"From this particular graph, we see that scoring a goal substantially increases your chances of winning \"Man of The Match.\" But extra goals beyond that appear to have little impact on predictions.\n",
"\n",
@@ -129,9 +123,7 @@
"outputs": [],
"source": [
"feature_to_plot = 'Distance Covered (Kms)'\n",
"pdp_dist = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n",
"\n",
"pdp.pdp_plot(pdp_dist, feature_to_plot)\n",
"disp2 = PartialDependenceDisplay.from_estimator(tree_model, val_X, [feature_to_plot])\n",
"plt.show()"
]
},
@@ -153,9 +145,7 @@
"# Build Random Forest model\n",
"rf_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)\n",
"\n",
"pdp_dist = pdp.pdp_isolate(model=rf_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n",
"\n",
"pdp.pdp_plot(pdp_dist, feature_to_plot)\n",
"disp3 = PartialDependenceDisplay.from_estimator(rf_model, val_X, [feature_to_plot])\n",
"plt.show()"
]
},
@@ -179,11 +169,10 @@
"metadata": {},
"outputs": [],
"source": [
"# Similar to previous PDP plot except we use pdp_interact instead of pdp_isolate and pdp_interact_plot instead of pdp_isolate_plot\n",
"features_to_plot = ['Goal Scored', 'Distance Covered (Kms)']\n",
"inter1 = pdp.pdp_interact(model=tree_model, dataset=val_X, model_features=feature_names, features=features_to_plot)\n",
"\n",
"pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n",
"# Similar to previous PDP plot except we use tuple of features instead of single feature\n",
"disp4 = PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n",
"plt.show()"
]
},
@@ -209,7 +198,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
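One optional follow-on to the tutorial's separate decision-tree and random-forest plots: scikit-learn's display objects can be drawn onto an existing set of axes via `ax=disp.axes_`, so the two models' partial dependence for `Distance Covered (Kms)` can be overlaid on a single chart. This is a sketch based on scikit-learn's documented plotting API, assuming the tutorial's `tree_model`, `rf_model`, and `val_X` are in scope; the labels and colors are purely illustrative.

```python
from matplotlib import pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

feature_to_plot = 'Distance Covered (Kms)'

# Plot the decision tree's PDP first, then draw the random forest's PDP
# on the same axes for a direct comparison
disp_tree = PartialDependenceDisplay.from_estimator(
    tree_model, val_X, [feature_to_plot], line_kw={'label': 'Decision tree'})
PartialDependenceDisplay.from_estimator(
    rf_model, val_X, [feature_to_plot],
    ax=disp_tree.axes_, line_kw={'label': 'Random forest', 'color': 'red'})

disp_tree.axes_[0, 0].legend()
plt.show()
```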