import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'
import warnings
warnings.simplefilter('ignore')
diabetes = pd.read_csv('data/diabetes.csv')
diabetes.head()
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
fig = (
diabetes.assign(Outcome=diabetes['Outcome'].astype(str))
.plot(kind='scatter', x='Glucose', y='BMI', color='Outcome',
color_discrete_map={'0': 'orange', '1': 'blue'},
title='Relationship between Glucose, BMI, and Diabetes',
template=TEMPLATE)
)
fig
Recall, we started with a relatively simple decision tree.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
X_train, X_test, y_train, y_test = train_test_split(diabetes[['Glucose', 'BMI']],
diabetes['Outcome'],
random_state=1)
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=2)
plt.figure(figsize=(15, 5))
plot_tree(dt, feature_names=X_train.columns, class_names=['no db', 'yes db'],
filled=True, rounded=True, fontsize=15, impurity=False);
Create a DecisionTreeClassifier
that
max_depth
, min_samples_split
, criterion
) that maximizes average validation accuracy.GridSearchCV
takes in:
fit
instance of an estimator, andand performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.
from sklearn.model_selection import GridSearchCV
The following dictionary contains the values we're considering for each hyperparameter. (We're using GridSearchCV
with 3 hyperparameters, but we could use it with even just a single hyperparameter.)
hyperparameters = {
'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
'min_samples_split': [2, 5, 10, 20, 50, 100, 200],
'criterion': ['gini', 'entropy']
}
Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.
len(hyperparameters['max_depth']) * \
len(hyperparameters['min_samples_split']) * \
len(hyperparameters['criterion'])
140
GridSearchCV
needs to be instantiated and fit
.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
searcher.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None], 'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})
After being fit
, the best_params_
attribute provides us with the best combination of hyperparameters to use.
searcher.best_params_
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 50}
All of the intermediate results – validation accuracies for each fold, mean validation accuaries, etc. – are stored in the cv_results_
attribute:
searcher.cv_results_['mean_test_score'] # Array of length 140.
array([0.7292054 , 0.7292054 , 0.7292054 , 0.7292054 , 0.7292054 , 0.7292054 , 0.72568216, 0.74136432, 0.74136432, 0.74136432, 0.74136432, 0.74484258, 0.73964018, 0.72568216, 0.75005997, 0.74832084, 0.75005997, 0.7517991 , 0.7517991 , 0.7465967 , 0.72568216, 0.73613193, 0.73613193, 0.73961019, 0.73961019, 0.74832084, 0.7465967 , 0.72568216, 0.71364318, 0.72058471, 0.72575712, 0.73617691, 0.7413943 , 0.7465967 , 0.72568216, 0.71185907, 0.70487256, 0.71530735, 0.7292054 , 0.73968516, 0.7465967 , 0.72568216, 0.68929535, 0.69794603, 0.70832084, 0.71878561, 0.73965517, 0.7465967 , 0.72568216, 0.7014093 , 0.69971514, 0.70662669, 0.71709145, 0.73794603, 0.7465967 , 0.72568216, 0.69277361, 0.69623688, 0.71527736, 0.71361319, 0.73965517, 0.7465967 , 0.72568216, 0.69107946, 0.69451274, 0.71358321, 0.72226387, 0.73965517, 0.7465967 , 0.72568216, 0.72398801, 0.72398801, 0.72398801, 0.72398801, 0.72398801, 0.72398801, 0.72394303, 0.74658171, 0.74658171, 0.74658171, 0.74658171, 0.74658171, 0.73616192, 0.72394303, 0.7517991 , 0.7517991 , 0.75005997, 0.75005997, 0.75005997, 0.73964018, 0.72394303, 0.75178411, 0.75178411, 0.75178411, 0.75178411, 0.75353823, 0.73964018, 0.72394303, 0.73268366, 0.73442279, 0.74310345, 0.74656672, 0.75353823, 0.73964018, 0.72394303, 0.71883058, 0.71533733, 0.73965517, 0.7413943 , 0.75182909, 0.73964018, 0.72394303, 0.68587706, 0.69967016, 0.72226387, 0.73791604, 0.75008996, 0.73964018, 0.72394303, 0.6962069 , 0.6961919 , 0.71355322, 0.73269865, 0.74664168, 0.73964018, 0.72394303, 0.67889055, 0.68407796, 0.70491754, 0.73269865, 0.75008996, 0.73964018, 0.72394303, 0.67716642, 0.68928036, 0.71361319, 0.72752624, 0.75008996, 0.73964018, 0.72394303])
# Rows correspond to folds, columns correspond to hyperparameter combinations.
pd.DataFrame(np.vstack([searcher.cv_results_[f'split{i}_test_score'] for i in range(5)]))
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.706897 | 0.706897 | 0.706897 | 0.706897 | 0.706897 | 0.706897 | 0.732759 | 0.715517 | 0.715517 | 0.715517 | ... | 0.698276 | 0.706897 | 0.732759 | 0.629310 | 0.663793 | 0.672414 | 0.672414 | 0.698276 | 0.706897 | 0.732759 |
1 | 0.773913 | 0.773913 | 0.773913 | 0.773913 | 0.773913 | 0.773913 | 0.756522 | 0.756522 | 0.756522 | 0.756522 | ... | 0.826087 | 0.773913 | 0.756522 | 0.704348 | 0.678261 | 0.765217 | 0.817391 | 0.826087 | 0.773913 | 0.756522 |
2 | 0.739130 | 0.739130 | 0.739130 | 0.739130 | 0.739130 | 0.739130 | 0.730435 | 0.756522 | 0.756522 | 0.756522 | ... | 0.721739 | 0.739130 | 0.730435 | 0.643478 | 0.678261 | 0.695652 | 0.678261 | 0.721739 | 0.739130 | 0.730435 |
3 | 0.704348 | 0.704348 | 0.704348 | 0.704348 | 0.704348 | 0.704348 | 0.704348 | 0.756522 | 0.756522 | 0.756522 | ... | 0.791304 | 0.756522 | 0.695652 | 0.747826 | 0.730435 | 0.730435 | 0.765217 | 0.791304 | 0.756522 | 0.695652 |
4 | 0.721739 | 0.721739 | 0.721739 | 0.721739 | 0.721739 | 0.721739 | 0.704348 | 0.721739 | 0.721739 | 0.721739 | ... | 0.713043 | 0.721739 | 0.704348 | 0.660870 | 0.695652 | 0.704348 | 0.704348 | 0.713043 | 0.721739 | 0.704348 |
5 rows × 140 columns
Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!
Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.
searcher.best_params_
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 50}
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree
DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=50)
final_tree.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=50)
# Training accuracy.
final_tree.score(X_train, y_train)
0.7829861111111112
# Testing accuracy.
final_tree.score(X_test, y_test)
0.765625
Remember, searcher
itself is a model object (we had to fit
it). After performing $k$-fold cross-validation, behind the scenes, searcher
is trained on the entire training set using the optimal combination of hyperparameters.
In other words, searcher
makes the same predictions that final_tree
does!
searcher.score(X_train, y_train)
0.7829861111111112
searcher.score(X_test, y_test)
0.765625
max_depth
s from 32 to 128, and 32 was the best, try including max_depth
s under 32.GridSearchCV
.GridSearchCV
will try all unique combinations of hyperparameters and return the combination with the best average validation performance.GridSearchCV
is not the only solution – see RandomizedSearchCV
if you're curious.people = pd.read_csv('data/SOCR-HeightWeight.csv').drop('Index', axis=1)
people.head()
Height (Inches) | Weight (Pounds) | |
---|---|---|
0 | 65.78331 | 112.9925 |
1 | 71.51521 | 136.4873 |
2 | 69.39874 | 153.0269 |
3 | 68.21660 | 142.3354 |
4 | 67.78781 | 144.2971 |
people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)',
title='Weight vs. Height for 25,000 18 Year Olds', template=TEMPLATE)
Suppose we fit a simple linear regression model that uses height in inches to predict weight in pounds.
$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)}$$from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(people[['Height (Inches)']],
people['Weight (Pounds)'],
random_state=1)
lr_one_feat = LinearRegression()
lr_one_feat.fit(X_train_1, y_train_1)
LinearRegression()
$w_0^*$ and $w_1^*$ are shown below, along with the model's testing RMSE.
lr_one_feat.intercept_, lr_one_feat.coef_
(-81.18621383194517, array([3.06263359]))
rmse_one_feat = mean_squared_error(y_test_1,
lr_one_feat.predict(X_test_1),
squared=False)
rmse_one_feat
10.044723811394737
Now, suppose we fit another regression model, that uses height in inches AND height in centimeters to predict weight.
$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)} + w_2 \cdot \text{height (cm)}$$people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(people[['Height (Inches)', 'Height (cm)']],
people['Weight (Pounds)'],
random_state=1)
lr_two_feat = LinearRegression()
lr_two_feat.fit(X_train_2, y_train_2)
LinearRegression()
What are $w_0^*$, $w_1^*$, $w_2^*$, and the model's testing RMSE?
lr_two_feat.intercept_, lr_two_feat.coef_
(-81.16135662853335, array([ 7.41066921e+11, -2.91758630e+11]))
rmse_two_feat = mean_squared_error(y_test_2,
lr_two_feat.predict(X_test_2),
squared=False)
rmse_two_feat
10.044966596201823
Observation: The intercept is the same as before (roughly -81.17), as is the testing RMSE. However, the coefficients on 'Height (Inches)'
and 'Height (cm)'
are massive in size!
What's going on?
Let's use simpler numbers for illustration. Suppose in the first model, $w_0^* = -80$ and $w_1^* = 3$.
In the second model, we have:
$$\begin{align*}\text{predicted weight (pounds)} &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \text{height (cm)} \\ &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \big( 2.54^* \cdot \text{height (inches)} \big) \\ &= w_0^* + \left(w_1^* + 2.54 \cdot w_2^* \right) \cdot \text{height (inches)} \end{align*}$$In the first model, we already found the "best" intercept ($-80$) and slope ($3$) in a linear model that uses height in inches to predict weight.
So, as long as $w_1^* + 2.54 \cdot w_2^* = 3$ in the second model, the second model's training predictions will be the same as the first, and hence they will also minimize RMSE.
Issue: There are an infinite number of $w_1^*$ and $w_2^*$ values that satisfy $w_1^* + 2.54 \cdot w_2^* = 3$!
lr.coef_
could return either set of coefficients, or any other of the infinitely many options.(-80 - 10 * people.iloc[:, 0] + (13 / 2.54) * people.iloc[:, 2]).head()
0 117.34993 1 134.54563 2 128.19622 3 124.64980 4 123.36343 dtype: float64
(-80 + 10 * people.iloc[:, 0] - (7 / 2.54) * people.iloc[:, 2]).head()
0 117.34993 1 134.54563 2 128.19622 3 124.64980 4 123.36343 dtype: float64
When we one hot encode categorical features, we create several redundant columns.
tips = px.data.tips()
tips_features = tips.drop('tip', axis=1)
tips_features.head()
total_bill | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|
0 | 16.99 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | Female | No | Sun | Dinner | 4 |
Aside: You can use pd.get_dummies
in EDA, but don't use it for modeling (instead, use OneHotEncoder
, which works with Pipeline
s).
X = pd.get_dummies(tips_features)
X.head()
total_bill | size | sex_Female | sex_Male | smoker_No | smoker_Yes | day_Fri | day_Sat | day_Sun | day_Thur | time_Dinner | time_Lunch | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
1 | 10.34 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
2 | 21.01 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
3 | 23.68 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4 | 24.59 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
Remember that under the hood, LinearRegression()
creates a design matrix that has a column of all ones (for the intercept term). Let's add that column above for demonstration.
X['all_ones'] = 1
X.head()
total_bill | size | sex_Female | sex_Male | smoker_No | smoker_Yes | day_Fri | day_Sat | day_Sun | day_Thur | time_Dinner | time_Lunch | all_ones | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
1 | 10.34 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
2 | 21.01 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
3 | 23.68 | 2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
4 | 24.59 | 4 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
Now, many of the above columns can be written as linear combinations of other columns!
'sex_Male'
– its value is just 'all_ones'
- 'sex_Female'
.'smoker_Yes'
– its value is just 'all_ones'
- 'smoker_No'
.'time_Lunch'
– its value is just 'all_ones'
- 'time_Dinner'
.'day_Thur'
– its value is just 'all_ones'
- ('day_Fri'
+ 'day_Sat'
+ 'day_Sun'
).Note that if we get rid of the four redundant columns above, the rank of our design matrix – that is, the number of linearly independent columns it has – does not change (and so the "predictive power" of our features don't change either).
np.linalg.matrix_rank(X)
9
np.linalg.matrix_rank(X.drop(columns=['sex_Male', 'smoker_Yes', 'time_Lunch', 'day_Thur']))
9
However, without the redundant columns, there is only a single unique set of optimal parameters $w^*$, and the multicollinearity is no more.
Aside: Most one hot encoding techniques (including OneHotEncoder
) have an in-built drop
argument, which allow you to specify that you'd like to drop one column per categorical feature.
pd.get_dummies(tips_features, drop_first=True)
total_bill | size | sex_Male | smoker_Yes | day_Sat | day_Sun | day_Thur | time_Lunch | |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 10.34 | 3 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | 21.01 | 3 | 1 | 0 | 0 | 1 | 0 | 0 |
3 | 23.68 | 2 | 1 | 0 | 0 | 1 | 0 | 0 |
4 | 24.59 | 4 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 3 | 1 | 0 | 1 | 0 | 0 | 0 |
240 | 27.18 | 2 | 0 | 1 | 1 | 0 | 0 | 0 |
241 | 22.67 | 2 | 1 | 1 | 1 | 0 | 0 | 0 |
242 | 17.82 | 2 | 1 | 0 | 1 | 0 | 0 | 0 |
243 | 18.78 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
244 rows × 8 columns
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first')
ohe.fit_transform(tips_features[['sex', 'smoker', 'day', 'time']]).toarray()
array([[0., 0., 0., 1., 0., 0.], [1., 0., 0., 1., 0., 0.], [1., 0., 0., 1., 0., 0.], ..., [1., 1., 1., 0., 0., 0.], [1., 0., 1., 0., 0., 0.], [0., 0., 0., 0., 1., 0.]])
The above array only has $(2-1) + (2-1) + (4-1) + (2-1) = 6$ columns, rather than $2 + 2 + 4 + 2 = 10$, since we dropped 1 per categorical column in tips_features
.
reviews = pd.read_json(open('data/reviews.json'), lines=True)
reviews.head()
reviewerID | asin | reviewerName | helpful | reviewText | overall | summary | unixReviewTime | reviewTime | |
---|---|---|---|---|---|---|---|---|---|
0 | A1JZFGZEZVWQPY | B00002N674 | Carter H "1amazonreviewer@gmail . com" | [4, 4] | Good USA company that stands behind their prod... | 4 | Great Hoses | 1308614400 | 06 21, 2011 |
1 | A32JCI4AK2JTTG | B00002N674 | Darryl Bennett "Fuzzy342" | [0, 0] | This is a high quality 8 ply hose. I have had ... | 5 | Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ... | 1402272000 | 06 9, 2014 |
2 | A3N0P5AAMP6XD2 | B00002N674 | H B | [2, 3] | It's probably one of the best hoses I've ever ... | 4 | Very satisfied! | 1336176000 | 05 5, 2012 |
3 | A2QK7UNJ857YG | B00002N674 | Jason | [0, 0] | I probably should have bought something a bit ... | 5 | Very high quality | 1373846400 | 07 15, 2013 |
4 | AS0CYBAN6EM06 | B00002N674 | jimmy | [1, 1] | I bought three of these 5/8-inch Flexogen hose... | 5 | Good Hoses | 1375660800 | 08 5, 2013 |
Goal: Use a review's 'summary'
to predict its 'overall'
rating.
Note that there are five possible 'overall'
rating values – 1, 2, 3, 4, 5 – not just two. As such, this is an instance of multiclass classification.
reviews['overall'].value_counts(normalize=True)
5 0.530214 4 0.254973 3 0.125000 2 0.050708 1 0.039105 Name: overall, dtype: float64
Question: What is the worst possible accuracy we should expect from a ratings classifier, given the above distribution?
CountVectorizer
¶Entries in the 'summary'
column are not currently quantitative! We can use the bag of words encoding to create quantitative features out of each 'summary'
.
Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn
's CountVectorizer
. (There is also a TfidfVectorizer
.)
from sklearn.feature_extraction.text import CountVectorizer
example_corp = ['hey hey hey my name is billy',
'hey billy how is your dog billy']
count_vec = CountVectorizer()
count_vec.fit(example_corp)
CountVectorizer()
count_vec
learned a vocabulary from the corpus we fit
it on.
count_vec.vocabulary_
{'hey': 2, 'my': 5, 'name': 6, 'is': 4, 'billy': 0, 'how': 3, 'your': 7, 'dog': 1}
count_vec.transform(example_corp).toarray()
array([[1, 0, 3, 0, 1, 1, 1, 0], [2, 1, 1, 1, 1, 0, 0, 1]])
Note that the values in count_vec.vocabulary_
correspond to the positions of the columns in count_vec.transform(example_corp).toarray()
, i.e. 'billy'
is the first column and 'your'
is the last column.
example_corp
['hey hey hey my name is billy', 'hey billy how is your dog billy']
pd.DataFrame(count_vec.transform(example_corp).toarray(),
columns=pd.Series(count_vec.vocabulary_).sort_values().index)
billy | dog | hey | how | is | my | name | your | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1 | 1 | 1 | 0 |
1 | 2 | 1 | 1 | 1 | 1 | 0 | 0 | 1 |
Pipeline
¶Let's build a Pipeline
that takes in summaries and overall ratings and:
CountVectorizer
to quantitatively encode summaries.RandomForestClassifier
to the data.But first, a train-test split (like always).
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
X = reviews['summary']
y = reviews['overall']
X_train, X_test, y_train, y_test = train_test_split(X, y)
To start, we'll create a random forest with 7 trees (n_estimators
) each of which has a maximum depth of 8 (max_depth
).
pl = Pipeline([
('cv', CountVectorizer()),
('clf', RandomForestClassifier(max_depth=8, n_estimators=7)) # Uses 7 separate decision trees!
])
pl.fit(X_train, y_train)
Pipeline(steps=[('cv', CountVectorizer()), ('clf', RandomForestClassifier(max_depth=8, n_estimators=7))])
# Training accuracy.
pl.score(X_train, y_train)
0.5340566606389391
# Testing accuracy.
pl.score(X_test, y_test)
0.5394816154309825
The accuracy of our random forest is just above 50%, on both the training and testing sets. We'd get the same performance by predicting a rating of 5 every time!
# Distribution of true ys in the training set: 53% are 5.
y_train.value_counts(normalize=True)
5 0.527928 4 0.257284 3 0.126482 2 0.051135 1 0.037171 Name: overall, dtype: float64
# Distribution of predicted ys in the training set: 99.8% are 5.
# It turns out we essentially are predicting 5 every time!
pd.Series(pl.predict(X_train)).value_counts(normalize=True)
5 0.987241 4 0.010749 3 0.001808 2 0.000201 dtype: float64
len(pl.named_steps['cv'].vocabulary_) # We have many features, but we are not asking many questions!
5277
GridSearchCV
¶We arbitrarily chose max_depth=8
before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth
with the best generalization performance.
# Note that we've used the key clf__max_depth, not max_depth
# because max_depth is a hyperparameter of clf, not of pl.
hyperparameters = {
'clf__max_depth': np.arange(2, 500, 20)
}
Note that while pl
has already been fit
, we can still give it to GridSearchCV
, which will repeatedly re-fit
it during cross-validation.
# Takes 10+ seconds to run – how many trees are being trained?
grids = GridSearchCV(pl, param_grid=hyperparameters, return_train_score=True)
grids.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()), ('clf', RandomForestClassifier(max_depth=8, n_estimators=7))]), param_grid={'clf__max_depth': array([ 2, 22, 42, 62, 82, 102, 122, 142, 162, 182, 202, 222, 242, 262, 282, 302, 322, 342, 362, 382, 402, 422, 442, 462, 482])}, return_train_score=True)
grids.best_params_
{'clf__max_depth': 122}
Recall, fit
GridSearchCV
objects are estimators on their own as well. This means we can compute the training and testing accuracies of the "best" random forest directly:
# Training accuracy.
grids.score(X_train, y_train)
0.8022905364677516
# Testing accuracy.
grids.score(X_test, y_test)
0.5898131404460518
Still not much better on the testing set! 🤷
Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).
index = grids.param_grid['clf__max_depth']
train = grids.cv_results_['mean_train_score']
valid = grids.cv_results_['mean_test_score']
pd.DataFrame({'train': train, 'valid': valid}, index=index).plot().update_layout(
xaxis_title='max_depth', yaxis_title='Accuracy'
)
Unsurprisingly, training accuracy kept increasing, while validation accuracy leveled off around a depth of ~100.
CountVectorizer
transformer can be used to perform the bag of words encoding.Metrics for measuring the performance of classifiers other than accuracy.