In [1]:
from dsc80_utils import *
Announcements 📣¶
- Lab 8 due tomorrow.
- Final Project Checkpoint 2 due Tuesday.
Agenda¶
- Grid search
- Random forests
- Modeling with text features
- Classifier evaluation
Generalization¶
Question 🤔 (Answer at dsc80.com/q)
Code: 10cv
- Suppose you have a training dataset with 1000 rows.
- You want to decide between 20 candidate hyperparameter values for a particular model.
- To do so, you perform 10-fold cross-validation.
- How many times is the first row in the training dataset (X.iloc[0]) used for training a model?
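One way to check your answer empirically is to simulate the splits. Here's a minimal sketch using sklearn's KFold; the 1000 rows and 20 candidates are stand-ins for the setup described above:
import numpy as np
from sklearn.model_selection import KFold

# Hypothetical setup matching the question: 1000 rows, 20 candidate
# hyperparameter values, 10-fold cross-validation.
n_rows, n_candidates = 1000, 20
kf = KFold(n_splits=10)

# Row 0 lands in the training portion of every fold except the one
# fold in which it is held out for validation.
folds_training_on_row_0 = sum(
    0 in train_idx for train_idx, _ in kf.split(np.zeros((n_rows, 1)))
)

# The same 10 splits are repeated for each of the 20 candidates.
print(folds_training_on_row_0 * n_candidates)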
Summary: Generalization¶
1. Split the data into two sets: training and test.
2. Use only the training data when designing, training, and tuning the model.
    - Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
    - Do not ❌ look at the test data in this step!
3. Commit to your final model and train it using the entire training set.
4. Test the final model using the test data. If the performance (e.g., RMSE) is not acceptable, return to step 2.
5. Finally, train on all available data and ship the model to production! 🛳 (These steps are sketched in code below.)
🚨 This is the process you should always use! 🚨
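Here's a minimal sklearn sketch of these steps, assuming a hypothetical feature DataFrame X, label Series y, and a decision tree whose max_depth is being tuned:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Steps 1-2: split, then tune using cross-validation on the training set only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
searcher = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid={'max_depth': list(range(1, 21))},  # hypothetical grid
    cv=5,
)
searcher.fit(X_train, y_train)

# Step 3 happens automatically: GridSearchCV refits the best model
# on all of X_train.

# Step 4: evaluate the committed model on the test set.
test_accuracy = searcher.score(X_test, y_test)

# Step 5: if test_accuracy is acceptable, retrain on all data and ship.
final_model = DecisionTreeClassifier(**searcher.best_params_).fit(X, y)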
Decision trees 🌲¶
Decision trees can be used for both regression and classification. We'll start by using them for classification.
Example: Should I get groceries?¶
Decision trees make classifications by answering a series of yes/no questions.
Should I go to Trader Joe's to buy groceries today?
Internal nodes of the tree ask questions; leaf nodes make predictions $H(x)$.
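Conceptually, a fitted decision tree behaves like nested if/else statements. Here's a hypothetical sketch (the specific questions below are made up for illustration; they aren't from the actual grocery tree):
def should_go_to_trader_joes(fridge_is_empty, is_weekend):
    # Internal nodes ask yes/no questions about the input x...
    if fridge_is_empty:
        return True    # ...and leaf nodes return the prediction H(x).
    if is_weekend:
        return True
    return False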
Example: Predicting diabetes¶
In [2]:
diabetes = pd.read_csv(Path('data') / 'diabetes.csv')
display_df(diabetes, cols=9)
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.63 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.35 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.67 | 32 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.24 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.35 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.32 | 23 | 0 |

768 rows × 9 columns
In [3]:
# 0 means no diabetes, 1 means yes diabetes.
diabetes['Outcome'].value_counts()
Out[3]:
Outcome
0    500
1    268
Name: count, dtype: int64
- 'Glucose' is measured in mg/dL (milligrams per deciliter).
- 'BMI' is calculated as $\text{BMI} = \frac{\text{weight (kg)}}{\left[ \text{height (m)} \right]^2}$ (a quick sanity check appears after this list).
- Let's use 'Glucose' and 'BMI' to predict whether or not a patient has diabetes ('Outcome').
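As a quick sanity check of the formula, with hypothetical measurements:
weight_kg, height_m = 70, 1.75   # hypothetical measurements
bmi = weight_kg / height_m ** 2  # 70 / 1.75² ≈ 22.9
bmi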
Exploring the dataset¶
First, a train-test split:
In [4]:
from sklearn.model_selection import train_test_split
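# By default, train_test_split reserves 25% of the rows for the test set.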
X_train, X_test, y_train, y_test = (
train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)
Class 0 (orange) is "no diabetes" and class 1 (blue) is "diabetes".
In [5]:
fig = (
X_train.assign(Outcome=y_train.astype(str))
.plot(kind='scatter', x='Glucose', y='BMI', color='Outcome',
color_discrete_map={'0': 'orange', '1': 'blue'},
title='Relationship between Glucose, BMI, and Diabetes')
)
fig