
Commit ea8ae24

Merge pull request #437 from Kaggle/fix-pdpbox
[ML Explainability] pdpbox no longer in docker image
2 parents 50dffce + 80edc8c commit ea8ae24
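Every file in this diff makes the same substitution: the course material stops importing `pdpbox` (no longer preinstalled in the Kaggle Docker image) and draws partial dependence plots with scikit-learn's `PartialDependenceDisplay` instead. A minimal, self-contained sketch of the before/after call pattern, assuming scikit-learn >= 1.0; the data and variable names below are synthetic stand-ins, not code from the repository:

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    # Synthetic stand-ins for the exercise's first_model / val_X
    rng = np.random.RandomState(0)
    val_X = pd.DataFrame({
        "pickup_longitude": rng.uniform(-74.1, -73.9, 500),
        "dropoff_longitude": rng.uniform(-74.1, -73.9, 500),
    })
    y = np.abs(val_X["pickup_longitude"] - val_X["dropoff_longitude"])
    first_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(val_X, y)

    # Old (pdpbox):  pdp_dist = pdp.pdp_isolate(...); pdp.pdp_plot(pdp_dist, feat_name)
    # New (sklearn): one call both computes the grid and draws the plot
    PartialDependenceDisplay.from_estimator(first_model, val_X, ["pickup_longitude"])
    plt.show()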

5 files changed: 94 additions, 85 deletions


learntools/ml_explainability/ex3.py

Lines changed: 54 additions & 29 deletions
@@ -5,15 +5,14 @@
 
 from learntools.core import *
 
+# 1
 class WhyThatUShape(ThoughtExperiment):
     _solution = \
     """
     The code is
 
     for feat_name in base_features:
-        pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X,
-                                   model_features=base_features, feature=feat_name)
-        pdp.pdp_plot(pdp_dist, feat_name)
+        PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])
         plt.show()
 
 
@@ -25,6 +24,7 @@ class WhyThatUShape(ThoughtExperiment):
     For the same reason, we see the general U-shape in all our partial dependence plots.
     """
 
+# 2
 class PonderPDPContour(ThoughtExperiment):
     _solution = \
     """
@@ -40,46 +40,66 @@ class PonderPDPContour(ThoughtExperiment):
 
     The code you need to create the desired plot is:
 
-    fnames = ['pickup_longitude', 'dropoff_longitude']
-    longitudes_partial_plot = pdp.pdp_interact(model=first_model, dataset=val_X,
-                                               model_features=base_features, features=fnames)
-    pdp.pdp_interact_plot(pdp_interact_out=longitudes_partial_plot,
-                          feature_names=fnames, plot_type='contour')
+    fig, ax = plt.subplots(figsize=(8, 6))
+    fnames = [('pickup_longitude', 'dropoff_longitude')]
+    disp = PartialDependenceDisplay.from_estimator(first_model, val_X, fnames, ax=ax)
     plt.show()
     """
 
+# 3
 class ReadPDPContour(CodingProblem):
     _var = 'savings_from_shorter_trip'
-    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the white contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
-    _solution = 'About \$15. The price decreases from slightly more than \$24 to slightly more than \$9.'
+    _hint = 'First find the vertical level corresponding to -74 dropoff longitude. Then read off the horizontal values you are switching between. Use the contour lines to orient yourself on what values you are near. You can round to the nearest integer rather than stressing about the exact cost to the nearest penny'
+    _solution = 'About 6. The price decreases from slightly less than 15 to slightly less than 9.'
     def check(self, savings):
         if type(savings) == str:
             savings = Decimal(dollars.strip('$'))
-        assert ((savings > 13) and (savings < 17)), "Your answer should be about 15. Not {}".format(savings)
+        assert ((savings > 4) and (savings < 8)), "Your answer should be about 6. Not {}".format(savings)
 
+# 4
 class MakePDPWithAbsFeatures(CodingProblem):
-    _var = 'pdp_dist'
-    _hint = 'use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
+    _var = 'disp'
+    _hint = 'Use the abs function when creating the abs_lat_change and abs_lon_change features. You don\'t need to change anything else.'
     _solution = \
     """
-    The biggest difference is that the partial dependence plot became much smaller. The the lowest vertical value is about $15 below the highest vertical value in the top chart, whereas this difference is only about $3 in the chart you just created. In other words, once you control for absolute distance traveled, the pickup_longitude has only a very small impact on predictions.
+    The difference is that the partial dependence plot became smaller. Both plots have a lowest vertical value of 8.5. But, the highest vertical value in the top chart is around 10.7, and the highest vertical value in the bottom chart is below 9.1. In other words, once you control for absolute distance traveled, the pickup_longitude has a smaller impact on predictions.
 
     # create new features
     data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
     data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)
     """
-
-    def check(self, pdp_result):
-        correct = np.array([9.92212681, 8.97384862, 8.80044327, 8.71024292, 8.71564739,
-                            8.73523192, 8.76626448, 8.87855912, 9.00098688, 10.99584622])
-        submitted = pdp_result.pdp
+
+    def check(self, disp):
+        correct = np.array([8.730515 , 8.73239078, 8.71804165, 8.72179009, 8.93013488,
+                            8.68796391, 8.6773792 , 8.6816932 , 8.67547295, 8.64980733,
+                            8.64402745, 8.65616918, 8.63485345, 8.60505726, 8.59167824,
+                            8.57101857, 8.55601734, 8.55780041, 8.53660205, 8.53548254,
+                            8.50739547, 8.50599988, 8.50685068, 8.51981394, 8.52555708,
+                            8.50483315, 8.53151955, 8.49615781, 8.49384454, 8.49156773,
+                            8.5123399 , 8.47138576, 8.47491902, 8.50240045, 8.50495725,
+                            8.50433279, 8.4941558 , 8.50175984, 8.50394946, 8.50890372,
+                            8.50606589, 8.48335522, 8.48281078, 8.4730394 , 8.47720942,
+                            8.47699659, 8.52118039, 8.50234077, 8.59717268, 8.51092865,
+                            8.51177667, 8.51159374, 8.51159432, 8.54379423, 8.50500559,
+                            8.50631149, 8.52264825, 8.51989952, 8.52841122, 8.52757692,
+                            8.54425047, 8.56425312, 8.56874055, 8.58372296, 8.5589557 ,
+                            8.57709991, 8.57441775, 8.59449221, 8.60063777, 8.62185164,
+                            8.6155473 , 8.6118143 , 8.61590988, 8.60758597, 8.62013413,
+                            8.6334263 , 8.64035478, 8.65324115, 8.66043255, 8.67502176,
+                            8.68940416, 8.6840402 , 8.67197893, 8.65512484, 8.66810839,
+                            8.6614093 , 8.65865671, 8.66485738, 8.67966737, 8.82833712,
+                            9.04135448, 9.03734449, 8.69506545, 8.70261503, 8.70673595,
+                            8.69045255, 8.69679997, 8.70716659, 8.71006281, 8.71739009])
+        submitted = disp.pd_results[0]['average'][0]
         assert np.allclose(submitted, correct, rtol=0.1)
 
+# 5
 class DoesSteepnessImplyImportance(ThoughtExperiment):
     _solution = "No. This doesn't guarantee `feat_a` is more important. For example, `feat_a` could have a big effect in the cases where it varies, but could have a single value 99\% of the time. In that case, permuting `feat_a` wouldn't matter much, since most values would be unchanged."
 
+# 6
 class DesignDatasetUShapedPdp(CodingProblem):
-    _var = 'pdp_dist'
+    _var = 'disp'
     _hint = "Consider explicitly using terms that include mathematical expressions like `(X1 < -1)`"
     _solution = CS(
     """
@@ -89,22 +109,26 @@ class DesignDatasetUShapedPdp(CodingProblem):
     # You don't need any more changes
     """)
 
-    def check(self, pdp_result):
-        segment_1_end = np.argmin(pdp_result.feature_grids<-1)
-        segment_3_start = np.argmax(pdp_result.feature_grids>1)
+    def check(self, disp):
+        pdp_result = disp.pd_results[0]
+        x_values = pdp_result['values'][0]
+        y_values = pdp_result['average'][0]
+
+        segment_1_end = np.argmin(x_values<-1)
+        segment_3_start = np.argmax(x_values>1)
         segment_2_start = segment_1_end + 1
         segment_2_end = segment_3_start - 1
 
-        segment_1_slopes_down = pdp_result.pdp[0] > pdp_result.pdp[segment_1_end]
-        segment_2_slopes_up = pdp_result.pdp[segment_2_start] < pdp_result.pdp[segment_2_end]
-        segment_3_slopes_down = pdp_result.pdp[segment_3_start] > pdp_result.pdp[-1]
+        segment_1_slopes_down = y_values[0] > y_values[segment_1_end]
+        segment_2_slopes_up = y_values[segment_2_start] < y_values[segment_2_end]
+        segment_3_slopes_down = y_values[segment_3_start] > y_values[-1]
 
         assert segment_1_slopes_down, ("The partial dependence plot does not slope down for values below -1.")
         assert segment_2_slopes_up, ("The partial dependence plot does not slope up for values between -1 and 1.")
         assert segment_3_slopes_down, ("The partial dependence plot does not slope down for values above 1.")
 
 class DesignFlatPDPWithHighImportance(CodingProblem):
-    _vars = ['perm', 'pdp_dist']
+    _vars = ['perm', 'disp']
     _hint = "You need for X1 to affect the prediction in order to have it affect permutation importance. But the average effect needs to be 0 to satisfy the PDP requirement. Achieve this by creating an interaction, so the effect of X1 depends on the value of X2 and vice-versa."
     _solution = CS(
     """
@@ -117,9 +141,10 @@ class DesignFlatPDPWithHighImportance(CodingProblem):
     # Aside from these lines, use the code provided
     """)
 
-    def check(self, importance, pdpResult):
+    def check(self, importance, disp):
         X1_imp = importance.feature_importances_[0]
-        pdpRange = max(pdpResult.pdp) - min(pdpResult.pdp)
+        pdpResult = disp.pd_results[0]['average'][0]
+        pdpRange = max(pdpResult) - min(pdpResult)
         assert (X1_imp > 0.5), ("Tested that X1 has an importance > 0.5. "
                                 "Actual importance was {}").format(X1_imp)
         assert (pdpRange < 0.5), ("Tested that the highest point on the Partial "
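The rewritten check() methods above no longer read `pdp_result.pdp` from a pdpbox object; they pull the grid and the averaged predictions out of the display object's `pd_results` attribute. A hedged, self-contained sketch of those accessors, assuming scikit-learn 1.0-1.2 (the `'values'` key is renamed `'grid_values'` in 1.3+); the data and names below are illustrative:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    rng = np.random.RandomState(1)
    df = pd.DataFrame({"X1": 4 * rng.rand(200) - 2, "X2": 4 * rng.rand(200) - 2})
    y = -2 * np.abs(df["X1"]) + df["X2"]
    model = RandomForestRegressor(n_estimators=30, random_state=1).fit(df, y)

    disp = PartialDependenceDisplay.from_estimator(model, df, ["X1"])
    result = disp.pd_results[0]        # one entry per requested feature (or feature pair)
    x_values = result["values"][0]     # grid of X1 values the PDP was evaluated at
    y_values = result["average"][0]    # averaged prediction at each grid point
    print(x_values.shape, y_values.shape)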

notebooks/ml_explainability/raw/ex3_partial_plots.ipynb

Lines changed: 22 additions & 29 deletions
@@ -17,6 +17,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Get most recent checking code\n",
+    "!pip install -U -t /kaggle/working/ git+https://github.com/Kaggle/learntools.git\n",
     "import pandas as pd\n",
     "from sklearn.ensemble import RandomForestRegressor\n",
     "from sklearn.linear_model import LinearRegression\n",

@@ -70,7 +72,7 @@
    "source": [
     "## Question 1\n",
     "\n",
-    "Here is the code to plot the partial dependence plot for pickup_longitude. Run the following cell."
+    "Here is the code to plot the partial dependence plot for `pickup_longitude`. Run the following cell without changes."
    ]
   },
   {

@@ -80,12 +82,10 @@
    "outputs": [],
    "source": [
     "from matplotlib import pyplot as plt\n",
-    "from pdpbox import pdp, get_dataset, info_plots\n",
+    "from sklearn.inspection import PartialDependenceDisplay\n",
     "\n",
     "feat_name = 'pickup_longitude'\n",
-    "pdp_dist = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n",
-    "\n",
-    "pdp.pdp_plot(pdp_dist, feat_name)\n",
+    "PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n",
     "plt.show()"
    ]
   },

@@ -107,7 +107,6 @@
    "outputs": [],
    "source": [
     "for feat_name in base_features:\n",
-    "    pdp_dist = ____\n",
     "    ____\n",
     "    plt.show()"
    ]

@@ -140,13 +139,13 @@
     "Now you will run a 2D partial dependence plot. As a reminder, here is the code from the tutorial. \n",
     "\n",
     "```\n",
-    "inter1 = pdp.pdp_interact(model=my_model, dataset=val_X, model_features=feature_names, features=['Goal Scored', 'Distance Covered (Kms)'])\n",
-    "\n",
-    "pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=['Goal Scored', 'Distance Covered (Kms)'], plot_type='contour')\n",
+    "fig, ax = plt.subplots(figsize=(8, 6))\n",
+    "f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n",
+    "PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n",
     "plt.show()\n",
     "```\n",
     "\n",
-    "Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`. Plot it appropriately?\n",
+    "Create a 2D plot for the features `pickup_longitude` and `dropoff_longitude`.\n",
     "\n",
     "What do you expect it to look like?"
    ]

@@ -157,6 +156,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "fig, ax = plt.subplots(figsize=(8, 6))\n",
+    "\n",
     "# Add your code here\n",
     "____"
    ]

@@ -183,7 +184,7 @@
    "metadata": {},
    "source": [
     "## Question 3\n",
-    "Consider a ride starting at longitude -73.92 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead?"
+    "Consider a ride starting at longitude -73.955 and ending at longitude -74. Using the graph from the last question, estimate how much money the rider would have saved if they'd started the ride at longitude -73.98 instead."
    ]
   },
   {

@@ -237,13 +238,10 @@
    "source": [
     "# This is the PDP for pickup_longitude without the absolute difference features. Included here to help compare it to the new PDP you create\n",
     "feat_name = 'pickup_longitude'\n",
-    "pdp_dist_original = pdp.pdp_isolate(model=first_model, dataset=val_X, model_features=base_features, feature=feat_name)\n",
-    "\n",
-    "pdp.pdp_plot(pdp_dist_original, feat_name)\n",
+    "PartialDependenceDisplay.from_estimator(first_model, val_X, [feat_name])\n",
     "plt.show()\n",
     "\n",
-    "\n",
-    "\n",
+    "# Your code here\n",
     "# create new features\n",
     "data['abs_lon_change'] = ____\n",
     "data['abs_lat_change'] = ____\n",

@@ -260,9 +258,7 @@
     "second_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(new_train_X, new_train_y)\n",
     "\n",
     "feat_name = 'pickup_longitude'\n",
-    "pdp_dist = pdp.pdp_isolate(model=second_model, dataset=new_val_X, model_features=features_2, feature=feat_name)\n",
-    "\n",
-    "pdp.pdp_plot(pdp_dist, feat_name)\n",
+    "disp = PartialDependenceDisplay.from_estimator(second_model, new_val_X, [feat_name])\n",
     "plt.show()\n",
     "\n",
     "# Check your answer\n",

@@ -273,7 +269,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Uncomment the lines below to see a hint or the solution (including an explanation of the important differences between the plots)."
+    "Uncomment the line below to see a hint or the solution (including an explanation of the important differences between the plots)."
    ]
   },
   {

@@ -339,19 +335,17 @@
     "# Create array holding predictive feature\n",
     "X1 = 4 * rand(n_samples) - 2\n",
     "X2 = 4 * rand(n_samples) - 2\n",
+    "\n",
+    "# Your code here\n",
     "# Create y. you should have X1 and X2 in the expression for y\n",
     "y = np.ones(n_samples)\n",
     "\n",
-    "# create dataframe because pdp_isolate expects a dataFrame as an argument\n",
+    "# create dataframe \n",
     "my_df = pd.DataFrame({'X1': X1, 'X2': X2, 'y': y})\n",
     "predictors_df = my_df.drop(['y'], axis=1)\n",
     "\n",
     "my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n",
-    "\n",
-    "pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n",
-    "\n",
-    "# visualize your results\n",
-    "pdp.pdp_plot(pdp_dist, 'X1')\n",
+    "disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'])\n",
    "plt.show()\n",
     "\n",
     "# Check your answer\n",

@@ -410,8 +404,7 @@
     "my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, my_df.y)\n",
     "\n",
     "\n",
-    "pdp_dist = pdp.pdp_isolate(model=my_model, dataset=my_df, model_features=['X1', 'X2'], feature='X1')\n",
-    "pdp.pdp_plot(pdp_dist, 'X1')\n",
+    "disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ['X1'], grid_resolution=300)\n",
     "plt.show()\n",
     "\n",
     "perm = PermutationImportance(my_model).fit(predictors_df, my_df.y)\n",

@@ -448,7 +441,7 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "Python 3",
+     "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
     },
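One detail in this notebook's solution cell: the final plot passes `grid_resolution=300`, which asks `from_estimator` to evaluate the model on 300 grid points for the feature instead of the default 100, giving a finer curve at the cost of more predictions. A small illustrative sketch under the same assumptions as the earlier snippet (synthetic data, illustrative names, scikit-learn >= 1.0):

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    rng = np.random.RandomState(1)
    predictors_df = pd.DataFrame({"X1": 4 * rng.rand(500) - 2, "X2": 4 * rng.rand(500) - 2})
    y = predictors_df["X1"] * predictors_df["X2"]   # interaction keeps the average PDP of X1 flat
    my_model = RandomForestRegressor(n_estimators=30, random_state=1).fit(predictors_df, y)

    # default grid_resolution is 100; 300 evaluates a denser grid of X1 values
    disp = PartialDependenceDisplay.from_estimator(my_model, predictors_df, ["X1"], grid_resolution=300)
    plt.show()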

notebooks/ml_explainability/raw/tut3_partial_plots.ipynb

Lines changed: 12 additions & 23 deletions
@@ -88,8 +88,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "\n",
-    "Here is the code to create the Partial Dependence Plot using the [PDPBox library](https://pdpbox.readthedocs.io/en/latest/)."
+    "Here is the code to create the Partial Dependence Plot using the scikit-learn library."
    ]
   },
   {

@@ -99,23 +98,18 @@
    "outputs": [],
    "source": [
     "from matplotlib import pyplot as plt\n",
-    "from pdpbox import pdp, get_dataset, info_plots\n",
-    "\n",
-    "# Create the data that we will plot\n",
-    "pdp_goals = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature='Goal Scored')\n",
+    "from sklearn.inspection import PartialDependenceDisplay\n",
     "\n",
-    "# plot it\n",
-    "pdp.pdp_plot(pdp_goals, 'Goal Scored')\n",
+    "# Create and plot the data\n",
+    "disp1 = PartialDependenceDisplay.from_estimator(tree_model, val_X, ['Goal Scored'])\n",
     "plt.show()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A few items are worth pointing out as you interpret this plot\n",
-    "- The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n",
-    "- A blue shaded area indicates level of confidence\n",
+    "The y axis is interpreted as **change in the prediction** from what it would be predicted at the baseline or leftmost value.\n",
     "\n",
     "From this particular graph, we see that scoring a goal substantially increases your chances of winning \"Man of The Match.\" But extra goals beyond that appear to have little impact on predictions.\n",
     "\n",

@@ -129,9 +123,7 @@
    "outputs": [],
    "source": [
     "feature_to_plot = 'Distance Covered (Kms)'\n",
-    "pdp_dist = pdp.pdp_isolate(model=tree_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n",
-    "\n",
-    "pdp.pdp_plot(pdp_dist, feature_to_plot)\n",
+    "disp2 = PartialDependenceDisplay.from_estimator(tree_model, val_X, [feature_to_plot])\n",
     "plt.show()"
    ]
   },

@@ -153,9 +145,7 @@
     "# Build Random Forest model\n",
     "rf_model = RandomForestClassifier(random_state=0).fit(train_X, train_y)\n",
     "\n",
-    "pdp_dist = pdp.pdp_isolate(model=rf_model, dataset=val_X, model_features=feature_names, feature=feature_to_plot)\n",
-    "\n",
-    "pdp.pdp_plot(pdp_dist, feature_to_plot)\n",
+    "disp3 = PartialDependenceDisplay.from_estimator(rf_model, val_X, [feature_to_plot])\n",
     "plt.show()"
    ]
   },

@@ -179,11 +169,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Similar to previous PDP plot except we use pdp_interact instead of pdp_isolate and pdp_interact_plot instead of pdp_isolate_plot\n",
-    "features_to_plot = ['Goal Scored', 'Distance Covered (Kms)']\n",
-    "inter1 = pdp.pdp_interact(model=tree_model, dataset=val_X, model_features=feature_names, features=features_to_plot)\n",
-    "\n",
-    "pdp.pdp_interact_plot(pdp_interact_out=inter1, feature_names=features_to_plot, plot_type='contour')\n",
+    "fig, ax = plt.subplots(figsize=(8, 6))\n",
+    "f_names = [('Goal Scored', 'Distance Covered (Kms)')]\n",
+    "# Similar to previous PDP plot except we use tuple of features instead of single feature\n",
+    "disp4 = PartialDependenceDisplay.from_estimator(tree_model, val_X, f_names, ax=ax)\n",
     "plt.show()"
    ]
   },

@@ -209,7 +198,7 @@
    ],
    "metadata": {
     "kernelspec": {
-     "display_name": "Python 3",
+     "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
     },
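The tutorial's 2D plot now comes from the same API: passing a tuple of two feature names makes `from_estimator` draw a two-way (contour-style) partial dependence plot on the supplied Axes. A self-contained sketch with synthetic data and illustrative names, assuming scikit-learn >= 1.0:

    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import PartialDependenceDisplay

    rng = np.random.RandomState(0)
    df = pd.DataFrame({"X1": 4 * rng.rand(300) - 2, "X2": 4 * rng.rand(300) - 2})
    y = df["X1"] * df["X2"]                         # interaction so the 2D surface is non-trivial
    model = RandomForestRegressor(n_estimators=30, random_state=1).fit(df, y)

    fig, ax = plt.subplots(figsize=(8, 6))
    # a (feature, feature) tuple requests a two-way partial dependence plot
    disp = PartialDependenceDisplay.from_estimator(model, df, [("X1", "X2")], ax=ax)
    plt.show()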
