In [1]:
from dsc80_utils import *

Lecture 17 – Decision Trees and Random Forests¶

DSC 80, Winter 2024¶

Announcements 📣¶

  • The Project 4 checkpoint is due tonight, and the full project is due on Thursday, March 21st.
    • You can use slip days on the checkpoint, but not on the final deadline.
  • Lab 9 is due on Monday, March 11th.
  • The Final Exam is on Tuesday, March 19th from 3-6PM (room TBD).
    • Practice by working through old exams at practice.dsc80.com. Even more exams are there now, including this quarter's midterm.
    • You can bring two double-sided notes sheets (you can bring your midterm notes sheet, if you want).
    • More details to come.

Agenda 📆¶

  • Decision trees.
  • Grid search.
  • Random forests.
  • Modeling using text features.
  • Classifier evaluation.

Decision trees¶

Example: Predicting diabetes¶

Last class, we trained decision trees to classify whether or not someone had diabetes ('Outcome'), given their 'Glucose' and 'BMI'.

In [2]:
diabetes = pd.read_csv(Path('data') / 'diabetes.csv')
display_df(diabetes, cols=9)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.63 50 1
1 1 85 66 29 0 26.6 0.35 31 0
2 8 183 64 0 0 23.3 0.67 32 1
... ... ... ... ... ... ... ... ... ...
765 5 121 72 23 112 26.2 0.24 30 0
766 1 126 60 0 0 30.1 0.35 47 1
767 1 93 70 31 0 30.4 0.32 23 0

768 rows × 9 columns

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
    train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)
In [4]:
fig = (
    X_train.assign(Outcome=y_train.astype(str))
            .plot(kind='scatter', x='Glucose', y='BMI', color='Outcome', 
                  color_discrete_map={'0': 'orange', '1': 'blue'},
                  title='Relationship between Glucose, BMI, and Diabetes')
)
fig

Like with LinearRegression objects, we must instantiate and fit DecisionTreeClassifier objects.

In [ ]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=2, criterion='entropy')
In [6]:
dt.fit(X_train, y_train)
Out[6]:
DecisionTreeClassifier(criterion='entropy', max_depth=2)

Visualizing decision trees¶

Our fit decision tree is like a "flowchart", made up of a series of questions.

As before, orange is "no diabetes" and blue is "diabetes".

In [7]:
from sklearn.tree import plot_tree
plt.figure(figsize=(15, 5))
plot_tree(dt, feature_names=X_train.columns, class_names=['no db', 'yes db'], 
          filled=True, fontsize=15, impurity=True);

How did we decide which "questions" to ask in order to split nodes?
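To make this concrete, here's a minimal sketch of how a single candidate split could be scored with weighted entropy, using the X_train and y_train defined above. The 'Glucose' threshold of 120 is an arbitrary choice for illustration, not necessarily one the fit tree uses.

In [ ]:
def entropy(y):
    # Entropy of a label Series: -sum of p * log2(p) over the classes present.
    probs = y.value_counts(normalize=True)
    return -(probs * np.log2(probs)).sum()

def weighted_entropy_after_split(X, y, feature, threshold):
    # Average entropy of the two child nodes, weighted by the number of points in each.
    mask = X[feature] <= threshold
    left, right = y[mask], y[~mask]
    return (len(left) * entropy(left) + len(right) * entropy(right)) / len(y)

# The tree greedily picks the (feature, threshold) pair that reduces entropy the most.
entropy(y_train), weighted_entropy_after_split(X_train, y_train, 'Glucose', 120)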

Exercise

Taken from the Fall 2023 Final Exam.


Tree depth¶

Decision trees are trained by recursively picking the best split until:

  • all "leaf nodes" only contain training examples from a single class (good), or
  • it is impossible to split leaf nodes any further (not good).

By default, there is no "maximum depth" for a decision tree. As such, without restriction, decision trees tend to be very deep.

In [8]:
dt_no_max = DecisionTreeClassifier()
dt_no_max.fit(X_train, y_train)
Out[8]:
DecisionTreeClassifier()

A decision tree fit on our training data has a depth of around 20! (It is so deep that tree.plot_tree errors when trying to plot it.)

In [9]:
dt_no_max.tree_.max_depth
Out[9]:
22

At first, this tree seems "better" than our tree of depth 2, since its training accuracy is much much higher:

In [10]:
dt_no_max.score(X_train, y_train)
Out[10]:
0.9913194444444444
In [11]:
# Depth 2 tree.
dt.score(X_train, y_train)
Out[11]:
0.765625

But recall, we truly care about test set performance, and this decision tree has worse accuracy on the test set than our depth 2 tree.

In [12]:
dt_no_max.score(X_test, y_test)
Out[12]:
0.71875
In [13]:
# Depth 2 tree.
dt.score(X_test, y_test)
Out[13]:
0.7760416666666666

Decision trees and overfitting¶

  • Decision trees have a tendency to overfit. Why is that?
  • Unlike linear classification techniques (like logistic regression or SVMs), decision trees are non-linear.
    • They are also "non-parametric" – there are no $w^*$s to learn.
  • While being trained, decision trees ask enough questions to effectively memorize the correct response values in the training set. However, the relationships they learn are often overfit to the noise in the training set, and don't generalize well.
In [14]:
fig
  • A decision tree whose depth is not restricted will achieve 100% accuracy on any training set, as long as there are no "overlapping values" in the training set.
    • Two values overlap when they have the same features $x$ but different response values $y$ (e.g. if two patients have the same glucose levels and BMI, but one has diabetes and one doesn't).
  • One solution: Make the decision tree "less complex" by limiting the maximum depth.

Since sklearn.tree's plot_tree can't visualize extremely large decision trees, let's create and visualize some smaller decision trees.

In [ ]:
trees = {}
for d in [2, 4, 8]:
    trees[d] = DecisionTreeClassifier(max_depth=d, random_state=1)
    trees[d].fit(X_train, y_train)
    
    plt.figure(figsize=(15, 5), dpi=100)
    plot_tree(trees[d], feature_names=X_train.columns, class_names=['no db', 'yes db'], 
               filled=True, rounded=True, impurity=False)
    
    plt.show()

As tree depth increases, complexity increases, and our trees are more prone to overfitting. This means model bias decreases, but model variance increases.

Question: What is the "right" maximum depth to choose?

Hyperparameters for decision trees¶

  • max_depth is a hyperparameter for DecisionTreeClassifier.
  • There are many more hyperparameters we can tweak; look at the documentation for examples.
    • min_samples_split: The minimum number of samples required to split an internal node. The larger this is, the lower our tree's model variance is – why?
    • criterion: The function to measure the quality of a split ('gini' or 'entropy').
  • To ensure that our model generalizes well to unseen data, we need an efficient technique for trying different combinations of hyperparameters!
    • We could write a nested for-loop and manually call cross_val_score on each combination of hyperparameters, but this seems suboptimal.
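For intuition, here's a rough sketch of what that manual approach might look like, using an arbitrary (hypothetical) set of candidate values and the X_train and y_train from earlier. GridSearchCV, introduced next, automates exactly this pattern.

In [ ]:
from itertools import product
from sklearn.model_selection import cross_val_score

# Manually try every combination of candidate values and record the
# mean validation accuracy for each one.
results = {}
for depth, min_split in product([2, 4, 8, None], [2, 10, 50]):
    tree = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)
    results[(depth, min_split)] = cross_val_score(tree, X_train, y_train, cv=5).mean()

# The combination with the highest mean validation accuracy.
max(results, key=results.get)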

Grid search¶

Grid search¶

GridSearchCV takes in:

  • an un-fit instance of an estimator, and
  • a dictionary of hyperparameter values to try,

and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.

In [ ]:
from sklearn.model_selection import GridSearchCV

The following dictionary contains the values we're considering for each hyperparameter. (We're using GridSearchCV with 3 hyperparameters, but we could use it with even just a single hyperparameter.)

In [ ]:
hyperparameters = {
    'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None], 
    'min_samples_split': [2, 5, 10, 20, 50, 100, 200],
    'criterion': ['gini', 'entropy']
}

Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.

In [18]:
np.prod([len(v) for v in hyperparameters.values()])
Out[18]:
140

GridSearchCV needs to be instantiated and fit.

In [19]:
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
In [20]:
%%time
searcher.fit(X_train, y_train)
CPU times: user 927 ms, sys: 12.3 ms, total: 939 ms
Wall time: 939 ms
Out[20]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
                         'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})

After being fit, the best_params_ attribute provides us with the best combination of hyperparameters to use.

In [21]:
searcher.best_params_
Out[21]:
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}

All of the intermediate results – validation accuracies for each fold, mean validation accuracies, etc. – are stored in the cv_results_ attribute:

In [22]:
searcher.cv_results_['mean_test_score'] # Array of length 140.
Out[22]:
array([0.73, 0.73, 0.73, ..., 0.75, 0.74, 0.72])
In [23]:
# Rows correspond to folds, columns correspond to hyperparameter combinations.
pd.DataFrame(np.vstack([searcher.cv_results_[f'split{i}_test_score'] for i in range(5)]))
Out[23]:
0 1 2 3 ... 136 137 138 139
0 0.71 0.71 0.71 0.71 ... 0.70 0.68 0.71 0.73
1 0.77 0.77 0.77 0.77 ... 0.82 0.83 0.77 0.76
2 0.74 0.74 0.74 0.74 ... 0.68 0.72 0.74 0.73
3 0.70 0.70 0.70 0.70 ... 0.77 0.79 0.76 0.70
4 0.72 0.72 0.72 0.72 ... 0.70 0.71 0.72 0.70

5 rows × 140 columns

Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!

Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.

In [24]:
searcher.best_params_
Out[24]:
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}
In [25]:
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree
Out[25]:
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
In [26]:
final_tree.fit(X_train, y_train)
Out[26]:
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
In [27]:
# Training accuracy.
final_tree.score(X_train, y_train)
Out[27]:
0.7881944444444444
In [28]:
# Testing accuracy.
final_tree.score(X_test, y_test)
Out[28]:
0.765625

Remember, searcher itself is a model object (we had to fit it). After performing $k$-fold cross-validation, behind the scenes, searcher is trained on the entire training set using the optimal combination of hyperparameters.

In other words, searcher makes the same predictions that final_tree does!

In [29]:
searcher.score(X_train, y_train)
Out[29]:
0.7881944444444444
In [30]:
searcher.score(X_test, y_test)
Out[30]:
0.765625

Choosing possible hyperparameter values¶

  • A full grid search can take a long time.
    • In our previous example, we tried 140 combinations of hyperparameters.
    • Since we performed 5-fold cross-validation, we trained 700 decision trees under the hood.
  • Question: How do we pick the possible hyperparameter values to try?
  • Answer: Trial and error.
    • If the "best" choice of a hyperparameter was at one end of the range you tried, expand the range in that direction.
    • For instance, if you try max_depths from 32 to 128, and 32 was the best, try including max_depths under 32.

Key takeaways¶

  • To efficiently find hyperparameters through cross-validation, use GridSearchCV.
  • Specify which values to try for each hyperparameter, and GridSearchCV will try all unique combinations of hyperparameters and return the combination with the best average validation performance.
  • GridSearchCV is not the only solution – see RandomizedSearchCV if you're curious.
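If you're curious, here's a minimal sketch of RandomizedSearchCV using the same hyperparameters dictionary and training set as above. The choice of n_iter=20 is arbitrary; it samples that many random combinations instead of trying the full grid.

In [ ]:
from sklearn.model_selection import RandomizedSearchCV

# Samples n_iter random hyperparameter combinations rather than the full grid.
random_searcher = RandomizedSearchCV(
    DecisionTreeClassifier(), hyperparameters, n_iter=20, cv=5, random_state=1
)
random_searcher.fit(X_train, y_train)
random_searcher.best_params_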

Question 🤔 (Answer at q.dsc80.com)

Remember, you can always ask questions at q.dsc80.com!

Random forests¶

Recap: Decision trees¶

Decision trees are trained by finding the best questions to ask using the features in the training data. A good question is one that isolates classes as much as possible (i.e., one that minimizes the entropy of the resulting child nodes).

✅ Pros:

  • Relatively fast to train and make predictions.
    • Making predictions: $O(\text{tree depth})$, which is usually $O(\log n)$.
    • Training: see sklearn's documentation.
  • Easily interpretable.
  • Robust to irrelevant features – why?
  • Linear transformations on features don't affect predictions – why?

❌ Cons:

  • High variance: a complete tree (no depth limit) will almost always overfit!
  • Aren't the best at prediction in general (sensitive to outliers and noise in the training data, not good at extrapolating outside of the training data).

sklearn's documentation provides a good overview of the pros and cons of decision trees.

Another idea for reducing decision tree variance¶

We've already seen two ways to reduce the variance of a decision tree:

  • Control maximum depth: as max_depth decreases, variance decreases.
  • Only split nodes if they have a certain number of points: as min_samples_split increases, variance decreases.

Here's another idea: train a bunch of decision trees, then have them vote on a prediction!

  • Problem: If you use the same training data, you will always get the same tree.
  • Solution: Introduce randomness into training procedure to get different trees.

Idea 1: Bootstrap the training data¶

  • We can bootstrap the training data $T$ times, then train one tree on each resample.
  • Also known as bagging (Bootstrap AGgregating). In general, combining different predictors together is a useful technique called ensemble learning.
  • In practice, this doesn't make decision trees different enough from one another; for instance, if you have one really important feature, it'll always be used in the first split.
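To see the idea, here's a rough, hand-rolled sketch of bagging with 10 trees (an arbitrary number), using the X_train, y_train, X_test, and y_test from earlier. In practice you'd use sklearn's built-in ensemble classes instead.

In [ ]:
# Train each tree on a different bootstrap resample of the training data.
bagged_trees = []
for i in range(10):
    resample = X_train.assign(Outcome=y_train).sample(frac=1, replace=True, random_state=i)
    tree = DecisionTreeClassifier()
    tree.fit(resample.drop(columns='Outcome'), resample['Outcome'])
    bagged_trees.append(tree)

# Majority vote: average the 10 predicted labels (0s and 1s) and round.
votes = np.mean([tree.predict(X_test) for tree in bagged_trees], axis=0)
bagged_preds = (votes >= 0.5).astype(int)
(bagged_preds == y_test).mean()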

Idea 2: Only use a subset of features¶

  • At each split, take a random subset of $ m $ features instead of choosing from all $ d $ of them.
  • Rule of thumb: $ m \approx \sqrt d $ seems to work well. (This is the default that sklearn uses in the algorithm we're about to learn about.)
  • Key idea: For ensemble learning, you want the individual predictors to have low bias, high variance, and be uncorrelated with each other. That way, when you average them together, you have low bias AND low variance.
  • Random forest algorithm: Fit $T$ trees by using bagging and a random subset of features at each split. Predict by taking a vote from the $ T $ trees.
  • Decreasing $m$ tends to decrease the correlation between the individual trees, which reduces the variance of the ensemble, but it increases the bias of the individual trees (see the sketch below for how $m$ is set in sklearn).
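In sklearn, $m$ corresponds to the max_features argument of the random forest classes. Here's a small sketch; note that in recent versions of sklearn, 'sqrt' is already the default for classifiers, so spelling it out is only for clarity.

In [ ]:
from sklearn.ensemble import RandomForestClassifier

# Consider roughly sqrt(d) randomly chosen features at each split.
RandomForestClassifier(n_estimators=100, max_features='sqrt')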

Example: Predicting diabetes¶

We'll again aim to predict whether or not someone has diabetes, but this time, we'll use all possible columns in diabetes as features, rather than just 'Glucose' and 'BMI'.

In [31]:
X_train, X_test, y_train, y_test = (
    train_test_split(diabetes.drop(columns=['Outcome']), diabetes['Outcome'], random_state=1)
)
In [32]:
from sklearn.ensemble import RandomForestClassifier

Note that the default number of trees (n_estimators) is 100, but this is a hyperparameter we could tune if we'd like.

In [33]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
Out[33]:
RandomForestClassifier()
In [34]:
rf.score(X_train, y_train)
Out[34]:
1.0
In [35]:
# Note that because random forests are _random_,
# if you re-train the model above, the test accuracy
# you get will be slightly different every time!
rf.score(X_test, y_test)
Out[35]:
0.8072916666666666

Our random forest has both a better training accuracy and a better test accuracy than the best decision tree with a depth of 4:

In [36]:
dt = DecisionTreeClassifier(max_depth=4, criterion='entropy')
dt.fit(X_train, y_train)
Out[36]:
DecisionTreeClassifier(criterion='entropy', max_depth=4)
In [37]:
dt.score(X_train, y_train)
Out[37]:
0.7829861111111112
In [38]:
dt.score(X_test, y_test)
Out[38]:
0.7395833333333334

Question 🤔 (Answer at q.dsc80.com)

Remember, you can always ask questions at q.dsc80.com!

Modeling using text features¶

Example: Fake news¶

We have a dataset containing news articles and labels for whether the article was deemed "fake" or "real", taken from here.

In [39]:
news = pd.read_csv(Path('data') / 'fake_news_training.csv')
news
Out[39]:
baseurl content label
0 twitter.com \njavascript is not available.\n\nwe’ve detect... real
1 whitehouse.gov remarks by the president at campaign event -- ... real
2 web.archive.org the committee on energy and commerce\nbarton: ... real
... ... ... ...
658 politico.com full text: jeff flake on trump speech transcri... fake
659 pol.moveon.org moveon.org political action: 10 things to know... real
660 uspostman.com uspostman.com is for sale\nyes, you can transf... fake

661 rows × 3 columns

Goal: Use an article's content to predict its label.

In [40]:
news['label'].value_counts(normalize=True)
Out[40]:
real    0.55
fake    0.45
Name: label, dtype: float64

Question: What is the worst possible accuracy we should expect from a classifier, given the above distribution?

Aside: CountVectorizer¶

Entries in the 'content' column are not currently quantitative! We can use the bag of words encoding to create quantitative features out of each 'content'.

Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn's CountVectorizer. (There is also a TfidfVectorizer.)

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
In [42]:
example_corp = ['hey hey hey my name is billy', 
                'hey billy how is your dog billy']
In [43]:
count_vec = CountVectorizer()
count_vec.fit(example_corp)
Out[43]:
CountVectorizer()

count_vec learned a vocabulary from the corpus we fit it on.

In [44]:
count_vec.vocabulary_
Out[44]:
{'hey': 2,
 'my': 5,
 'name': 6,
 'is': 4,
 'billy': 0,
 'how': 3,
 'your': 7,
 'dog': 1}
In [45]:
count_vec.transform(example_corp).toarray()
Out[45]:
array([[1, 0, 3, 0, 1, 1, 1, 0],
       [2, 1, 1, 1, 1, 0, 0, 1]])

Note that the values in count_vec.vocabulary_ correspond to the positions of the columns in count_vec.transform(example_corp).toarray(), i.e. 'billy' is the first column and 'your' is the last column.

In [46]:
example_corp
Out[46]:
['hey hey hey my name is billy', 'hey billy how is your dog billy']
In [47]:
pd.DataFrame(count_vec.transform(example_corp).toarray(),
             columns=pd.Series(count_vec.vocabulary_).sort_values().index)
Out[47]:
billy dog hey how is my name your
0 1 0 3 0 1 1 1 0
1 2 1 1 1 1 0 0 1
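As an aside, in recent versions of sklearn (1.0 and later), get_feature_names_out returns the vocabulary already sorted in column order, which is a more direct way to label the columns:

In [ ]:
pd.DataFrame(count_vec.transform(example_corp).toarray(),
             columns=count_vec.get_feature_names_out())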

Creating an initial Pipeline¶

Let's build a Pipeline that takes in article content and labels and:

  • Uses CountVectorizer to quantitatively encode each article's 'content'.

  • Fits a RandomForestClassifier to the data.

But first, a train-test split (like always).

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
In [49]:
X = news['content']
y = news['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)

To start, we'll create a random forest with 100 trees (n_estimators) each of which has a maximum depth of 3 (max_depth).

In [50]:
pl = Pipeline([
    ('cv', CountVectorizer()), 
    ('clf', RandomForestClassifier(
        max_depth=3,
        n_estimators=100, # Uses 100 separate decision trees!
        random_state=42,
    )) 
])
In [51]:
pl.fit(X_train, y_train)
Out[51]:
Pipeline(steps=[('cv', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=3, random_state=42))])
In [52]:
# Training accuracy.
pl.score(X_train, y_train)
Out[52]:
0.7636363636363637
In [53]:
# Testing accuracy.
pl.score(X_test, y_test)
Out[53]:
0.6927710843373494

The accuracy of our random forest is just under 70% on the test set. How much better does it do compared to a classifier that predicts "real" every time?

In [54]:
y_train.value_counts(normalize=True)
Out[54]:
real    0.55
fake    0.45
Name: label, dtype: float64
In [55]:
# Distribution of predicted ys in the training set:
pd.set_option('display.float_format', '{:.3f}'.format) # Stops scientific notation for pandas.
pd.Series(pl.predict(X_train)).value_counts(normalize=True)
Out[55]:
fake   0.634
real   0.366
dtype: float64
In [56]:
len(pl.named_steps['cv'].vocabulary_) # Lots of features!
Out[56]:
25500

Choosing tree depth via GridSearchCV¶

We arbitrarily chose max_depth=3 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.

In [57]:
# Note that we've used the key clf__max_depth, not max_depth
# because max_depth is a hyperparameter of clf, not of pl.

hyperparameters = {
    'clf__max_depth': np.arange(2, 200, 20)
}

Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.

In [58]:
%%time

# Takes a few seconds to run – how many trees are being trained?
from sklearn.model_selection import GridSearchCV
grids = GridSearchCV(
    pl,
    n_jobs=-1, # Use multiple processors to parallelize.
    param_grid=hyperparameters,
    return_train_score=True
)
grids.fit(X_train, y_train)
CPU times: user 617 ms, sys: 205 ms, total: 823 ms
Wall time: 5.88 s
Out[58]:
GridSearchCV(estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(max_depth=3,
                                                               random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': array([  2,  22,  42,  62,  82, 102, 122, 142, 162, 182])},
             return_train_score=True)
In [59]:
grids.best_params_
Out[59]:
{'clf__max_depth': 42}

Recall, fit GridSearchCV objects are estimators on their own as well. This means we can compute the training and testing accuracies of the "best" random forest directly:

In [60]:
# Training accuracy.
grids.score(X_train, y_train)
Out[60]:
0.9919191919191919
In [61]:
# Testing accuracy.
grids.score(X_test, y_test)
Out[61]:
0.8614457831325302

When using the max depth above, test set accuracy is roughly 17 percentage points higher (about 0.86 vs. 0.69) than with a max depth of 3!

Training and validation accuracy vs. depth¶

Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).

In [62]:
index = grids.param_grid['clf__max_depth']
train = grids.cv_results_['mean_train_score']
valid = grids.cv_results_['mean_test_score']
In [63]:
pd.DataFrame({'train': train, 'valid': valid}, index=index).plot().update_layout(
    xaxis_title='max_depth', yaxis_title='Accuracy'
)

Question 🤔 (Answer at q.dsc80.com)

Remember, you can always ask questions at q.dsc80.com!

Classifier evaluation¶

Accuracy isn't everything!¶

$$ \text{accuracy} = \frac{\text{# data points classified correctly}}{\text{# data points}} $$
  • Accuracy is defined as the proportion of predictions that are correct.
  • It weighs all correct predictions the same, and weighs all incorrect predictions the same.
  • But some incorrect predictions may be worse than others!
    • Example: Suppose you take a COVID test 🦠. Which is worse:
      • The test saying you have COVID, when you really don't, or
      • The test saying you don't have COVID, when you really do?

The Boy Who Cried Wolf 👦😭🐺¶

(source)

A shepherd boy gets bored tending the town's flock. To have some fun, he cries out, "Wolf!" even though no wolf is in sight. The villagers run to protect the flock, but then get really mad when they realize the boy was playing a joke on them.

Repeat the previous paragraph many, many times.

One night, the shepherd boy sees a real wolf approaching the flock and calls out, "Wolf!" The villagers refuse to be fooled again and stay in their houses. The hungry wolf turns the flock into lamb chops. The town goes hungry. Panic ensues.

The wolf classifier¶

  • Predictor: Shepherd boy.
  • Positive prediction: "There is a wolf."
  • Negative prediction: "There is no wolf."

Some questions to think about:

  • What is an example of an incorrect, positive prediction?
  • Was there a correct, negative prediction?
  • There are four possibilities. What are the consequences of each?
    • (predict yes, predict no) x (actually yes, actually no).

The wolf classifier¶

Below, we present a confusion matrix, which summarizes the four possible outcomes of the wolf classifier.

(Image: confusion matrix for the wolf classifier.)

Outcomes in binary classification¶

When performing binary classification, there are four possible outcomes.

(Note: A "positive prediction" is a prediction of 1, and a "negative prediction" is a prediction of 0.)

Outcome of Prediction Definition True Class
True positive (TP) ✅ The predictor correctly predicts the positive class. P
False negative (FN) ❌ The predictor incorrectly predicts the negative class. P
True negative (TN) ✅ The predictor correctly predicts the negative class. N
False positive (FP) ❌ The predictor incorrectly predicts the positive class. N
⬇️
Predicted Negative Predicted Positive
Actually Negative TN ✅ FP ❌
Actually Positive FN ❌ TP ✅

The confusion matrix above is organized the same way that sklearn's confusion matrices are (but differently than in the wolf example).

Note that in the four acronyms – TP, FN, TN, FP – the first letter is whether the prediction is correct, and the second letter is what the prediction is.
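sklearn can compute this table for us. Here's a small sketch using sklearn.metrics.confusion_matrix on made-up labels; rows correspond to actual classes and columns to predicted classes, so for labels 0 and 1 the layout is [[TN, FP], [FN, TP]], matching the table above.

In [ ]:
from sklearn.metrics import confusion_matrix

y_actual = [0, 0, 0, 0, 1, 1]
y_pred   = [0, 0, 1, 0, 0, 1]
confusion_matrix(y_actual, y_pred)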

Example: COVID testing 🦠¶

  • UCSD Health administers hundreds of COVID tests a day. The tests are not fully accurate.
  • Each test comes back either:
    • positive, indicating that the individual has COVID, or
    • negative, indicating that the individual does not have COVID.
  • What is a TP in this scenario? FP? TN? FN?
  • TP: The test predicted that the individual has COVID, and they do ✅.
  • FP: The test predicted that the individual has COVID, but they don't ❌.
  • TN: The test predicted that the individual doesn't have COVID, and they don't ✅.
  • FN: The test predicted that the individual doesn't have COVID, but they do ❌.

Accuracy of COVID tests¶

The results of 100 UCSD Health COVID tests are given below.

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
UCSD Health test results

🤔 Question: What is the accuracy of the test?

🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$

  • Followup: At first, the test seems good. But, suppose we build a classifier that predicts that nobody has COVID. What would its accuracy be?
  • Answer to followup: Also 0.91! There is severe class imbalance in the dataset, meaning that most of the data points are in the same class (no COVID). Accuracy doesn't tell the full story.

Recall¶

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
UCSD Health test results

🤔 Question: What proportion of individuals who actually have COVID did the test identify?

🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$.

More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{recall} = \frac{TP}{\text{# actually positive}} = \frac{TP}{TP + FN}$$

To compute recall, look at the bottom (positive) row of the above confusion matrix.

Recall isn't everything, either!¶

$$\text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: Can you design a "COVID test" with perfect recall?

🙋 Answer: Yes – just predict that everyone has COVID!

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier
$$\text{recall} = \frac{TP}{TP + FN} = \frac{9}{9 + 0} = 1$$

Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!

Precision¶

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier

The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.

$$\text{precision} = \frac{TP}{\text{# predicted positive}} = \frac{TP}{TP + FP}$$

To compute precision, look at the right (positive) column of the above confusion matrix.

  • Tip: A good way to remember the difference between precision and recall is that in the denominator for 🅿️recision, both terms have 🅿️ in them (TP and FP).
  • Note that the "everyone-has-COVID" classifier has perfect recall, but a precision of $\frac{9}{9 + 91} = 0.09$, which is quite low.
  • 🚨 Key idea: There is a "tradeoff" between precision and recall. Ideally, you want both to be high. For a particular prediction task, one may be more important than the other.
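As a quick check of these numbers, here's the arithmetic for the everyone-has-COVID classifier, using the counts from the table above.

In [ ]:
# Counts from the everyone-has-COVID confusion matrix.
TP, FP, FN, TN = 9, 91, 0, 0
accuracy  = (TP + TN) / (TP + FP + FN + TN)   # 0.09
precision = TP / (TP + FP)                    # 0.09
recall    = TP / (TP + FN)                    # 1.0
accuracy, precision, recall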

Precision and recall¶

(Image: illustration of precision and recall.)
(source)

Precision and recall¶

$$\text{precision} = \frac{TP}{TP + FP} \: \: \: \: \: \: \: \: \text{recall} = \frac{TP}{TP + FN}$$

🤔 Question: When might high precision be more important than high recall?

🙋 Answer: For instance, in deciding whether or not someone committed a crime. Here, false positives are really bad – they mean that an innocent person is charged!

🤔 Question: When might high recall be more important than high precision?

🙋 Answer: For instance, in medical tests. Here, false negatives are really bad – they mean that someone's disease goes undetected!

Exercise

Consider the confusion matrix shown below.

Predicted Negative Predicted Positive
Actually Negative TN = 22 ✅ FP = 2 ❌
Actually Positive FN = 23 ❌ TP = 18 ✅

What is the accuracy of the above classifier? The precision? The recall?


After calculating all three on your own, check your work against the answers below.

👉 Accuracy: (22 + 18) / (22 + 2 + 23 + 18) = 40 / 65
👉 Precision: 18 / (18 + 2) = 9 / 10
👉 Recall: 18 / (18 + 23) = 18 / 41

Summary, next time¶

Summary¶

  • Decision trees, while interpretable, are prone to having high variance. There are several ways to control the variance of a decision tree:
    • Limit max_depth or increase min_samples_split.
    • Create a random forest, which is an ensemble of multiple decision trees, each fit to a different random resample of the training data, using a random sample of features.
  • In order to tune model hyperparameters – that is, to find the hyperparameters that (likely) maximize performance on unseen data – use GridSearchCV.
  • Accuracy alone is not always a meaningful representation of a classifier's quality, particularly when the classes are imbalanced.
    • Precision and recall are classifier evaluation metrics that consider the types of errors being made.
    • There is a "tradeoff" between precision and recall. One may be more important than the other, depending on the task.

Next time¶

Next time, we'll learn about logistic regression, another powerful classification technique. We'll use it in a practical example, and focus on understanding the tradeoffs between precision, recall, and accuracy.