from dsc80_utils import *
import lec16_util as util
📣 Announcements 📣¶
- Project 4 due tomorrow!
- Lab 9 out, due Dec 4.
- Final Exam on Mon, Dec 11, 3-6pm in WLH 2005 (our usual lecture room).
- Two cheat sheets allowed (feel free to reuse your midterm sheet).
- More details to come.
📆 Agenda¶
Practice Exam Question 🤔¶
- Suppose you have a training dataset with 1000 rows.
- You want to decide between 20 hyperparameters for a particular model.
- To do so, you perform 10-fold cross-validation.
- How many times is the first row in the training dataset (`X.iloc[0]`) used for training a model?
Review: Bias and Variance¶
np.random.seed(23) # For reproducibility.
def sample_dgp(n=100):
x = np.linspace(-2, 3, n)
y = x ** 3 + (np.random.normal(0, 3, size=n))
return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_dgp()
sample_2 = sample_dgp()
# Look at the definition of train_and_plot in lec16_util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')
Bias and variance¶
The training data we have access to is a sample from the DGP. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same DGP.
Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise:
- ⭐️ Model Bias: The expected deviation between a predicted value and an actual value.
- In other words, for a given $x_i$, how far is $H(x_i)$ from the true $y_i$, on average?
- Low bias is good! ✅
- High bias is a sign of underfitting, i.e. that our model is too basic to capture the relationship between our features and response.
- ⭐️ Model variance ("variance"): The variance of a model's predictions.
- In other words, for a given $x_i$, what is the variance of $H(x_i)$ across all datasets?
- Low model variance is good! ✅
- High model variance is a sign of overfitting, i.e. that our model is too complicated and is prone to fitting to the noise in our training data.
- Observation variance: The variance due to the random noise in the process we are trying to model (e.g. measurement error). We can't control this, even by collecting more data!
(See hand-written notes from lecture for more detail.)
Implications of Bias and Variance¶
- Risk: $ R(H) = \text{bias}^2 + \text{variance} + \text{irreducible error} $
Model Fit:
- Underfitting = too much bias
- Most overfitting = too much variance
- Training error reflects bias but not variance.
- Test error reflects both bias and variance.
As $n$ increases:
- Generally, $ n\uparrow $ means variance $ \downarrow $.
- If $ H(x) $ can fit the true DGP exactly, then $ n\uparrow $ means bias $ \downarrow $.
- For certain loss functions (e.g. MSE), bias will be 0 if $ H(x) $ can fit the true DGP exactly.
- If $ H(x) $ cannot fit the true DGP well, then bias will be large for most points.
As we add more features:
- Adding a useful feature reduces bias.
- Adding a useless feature doesn't change bias.
- Adding a feature generally increases variance, even if it's useless.
In real life:
- We don't usually know the true DGP, so we can't put actual numbers to the bias-variance decomposition.
- We use a train-test split so that we can estimate $ R(H) $ using the test set.
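Here's a minimal simulation sketch of this decomposition, reusing the sample_dgp function from above: it estimates the bias² and variance of polynomial predictions at the single point $x_0 = 2$, where the true (noiseless) value is $x_0^3$. We'd expect the degree 1 model to be dominated by bias and the degree 25 model by variance.
x0 = 2
true_y0 = x0 ** 3  # The noiseless value of the DGP at x0.
preds = {1: [], 3: [], 25: []}
for _ in range(500):
    sample = sample_dgp()
    for deg in preds:
        # np.polyfit may warn that the degree 25 fit is poorly conditioned.
        coeffs = np.polyfit(sample['x'], sample['y'], deg)
        preds[deg].append(np.polyval(coeffs, x0))
for deg, p in preds.items():
    p = np.array(p)
    print(f'degree {deg}: bias² ≈ {(p.mean() - true_y0) ** 2:.2f}, variance ≈ {p.var():.2f}')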
Example: Linear Regression¶
- If actual DGP is a linear model:
- Bias = 0.
- Variance $ \propto \frac{d}{n} $, where $ d $ is the dimension (number of features) per sample point.
- $ n \uparrow $ = variance $ \downarrow $
- $ d \uparrow $ = variance $ \uparrow $
Summary: Generalization¶
1. Split the data into two sets: training and test.
2. Use only the training data when designing, training, and tuning the model.
    - Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
    - Do not ❌ look at the test data in this step!
3. Commit to your final model and train it using the entire training set.
4. Test the model using the test data. If the performance (e.g. RMSE) is not acceptable, return to step 2.
5. Finally, train on all available data and ship the model to production! 🛳
🚨 This is the process you should always use! 🚨
Decision trees 🌲¶
Although decision trees can be used for both regression and classification, we'll be using them for classification.
Example: Should I get groceries?¶
- Internal nodes of tree check feature values.
- Leaf nodes of tree specify class $H(x)$.
Example: Predicting diabetes¶
diabetes = pd.read_csv('data/diabetes.csv')
display_df(diabetes, cols=9)
 | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.63 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.35 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.67 | 32 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.24 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.35 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.32 | 23 | 0 |
768 rows × 9 columns
# 0 means no diabetes, 1 means yes diabetes.
diabetes['Outcome'].value_counts()
0    500
1    268
Name: Outcome, dtype: int64
- `'Glucose'` is measured in mg/dL (milligrams per deciliter).
- `'BMI'` is calculated as $\text{BMI} = \frac{\text{weight (kg)}}{\left[ \text{height (m)} \right]^2}$.
- Let's use `'Glucose'` and `'BMI'` to predict whether or not a patient has diabetes (`'Outcome'`).
Exploring the dataset¶
First, a train-test split:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)
Class 0 (orange) is "no diabetes" and class 1 (blue) is "diabetes".
fig = (
X_train.assign(Outcome=y_train.astype(str))
.plot(kind='scatter', x='Glucose', y='BMI', color='Outcome',
color_discrete_map={'0': 'orange', '1': 'blue'},
title='Relationship between Glucose, BMI, and Diabetes')
)
fig
Building a decision tree¶
Let's build a decision tree and interpret the results.
The relevant class is `DecisionTreeClassifier`, from `sklearn.tree`.
from sklearn.tree import DecisionTreeClassifier
Note that we `fit` it the same way we `fit` earlier estimators.
You may wonder what `max_depth` and `criterion` do – more on this soon!
dt = DecisionTreeClassifier(max_depth=2, criterion='entropy')
dt.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=2)
Visualizing decision trees¶
Our fit decision tree is like a "flowchart", made up of a series of questions.
As before, orange is "no diabetes" and blue is "diabetes".
from sklearn.tree import plot_tree
plt.figure(figsize=(15, 5))
plot_tree(dt, feature_names=X_train.columns, class_names=['no db', 'yes db'],
filled=True, fontsize=15, impurity=False);
- To classify a new data point, we start at the top and answer the first question (i.e. "Glucose <= 129.5").
- If the answer is "Yes", we move to the left branch, otherwise we move to the right branch.
- We repeat this process until we end up at a leaf node, at which point we predict the most common class in that node.
- Note that each node has a `value` attribute, which describes the number of training individuals of each class that fell in that node.
# Note that the left node at depth 2 has a `value` of [304, 78].
y_train[X_train.query('Glucose <= 129.5').index].value_counts()
0    304
1     78
Name: Outcome, dtype: int64
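To make the traversal concrete, here's a quick sketch with a hypothetical patient (the feature values below are made up); the prediction follows the flowchart above.
# A hypothetical patient with a Glucose of 140 mg/dL and a BMI of 25.
patient = pd.DataFrame({'Glucose': [140], 'BMI': [25]})
# Since Glucose <= 129.5 is False here, the tree takes the right branch first.
dt.predict(patient)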
Evaluating classifiers¶
The most common evaluation metric in classification is accuracy:
$$\text{accuracy} = \frac{\text{# data points classified correctly}}{\text{# data points}}$$
(dt.predict(X_train) == y_train).mean()
0.765625
The `score` method of a classifier computes accuracy by default (just like the `score` method of a regressor computes $R^2$ by default). We want our classifiers to have high accuracy.
# Training accuracy – same number as above
dt.score(X_train, y_train)
0.765625
# Testing accuracy
dt.score(X_test, y_test)
0.7760416666666666
About decision trees¶
- Can work with categorical features, without one-hot encoding.
- Interpretable predictions.
- Decision boundary can be arbitrarily complicated.
- Works with multi-class classification (i.e. more than 2 possible outcomes).
How do we train?¶
Pseudocode:
def make_tree(X, y):
if all points in y have the same label C:
return Leaf(C)
f = best splitting feature # e.g. Glucose or BMI
v = best splitting value # e.g. 129.5
X_left, y_left = X, y where (X[f] < v)
X_right, y_right = X, y where (X[f] >= v)
left = make_tree(X_left, y_left)
right = make_tree(X_right, y_right)
return Node(f, v, left, right)
make_tree(X_train, y_train)
How do we decide on the best split?¶
- Choose a loss function $ L(X, y) $.
- Try all splits, then pick the one that minimizes $ L(X_{\text{left}}, y_{\text{left}}) + L(X_{\text{right}}, y_{\text{right}}) $.
- What's a good $ L(X, y) $?
Intuition: Suppose the distribution within a node looks like this (colors represent classes):
Split A:
- "Yes": 🟠🟠🟠🔵🔵🔵
- "No": 🟠🟠🟠🔵🔵🔵🔵
Split B:
- "Yes": 🔵🔵🔵🔵🔵🔵
- "No": 🔵🟠🟠🟠🟠🟠🟠
Which split is "better"?
Split B, because there is "less uncertainty" in the resulting nodes in split B than there is in split A.
One (bad) idea:¶
- Label a node with the majority class $ C $.
- $ L(X, y) $ = number of points where $ y \neq C $.
Why is this bad? Suppose we have:
Split A:
- "Yes": 🟠🟠🟠🟠🟠🟠🔵
- "No": 🟠🟠🟠🟠🟠🟠🔵🔵🔵🔵🔵
Split B:
- "Yes": 🟠🟠🟠🟠🟠🟠🔵🔵🔵
- "No": 🟠🟠🟠🟠🟠🟠🔵🔵🔵
We prefer Split A, but $ L(X, y) = 6 $ for both.
A better idea: entropy¶
- For each label $C$ within a node, define $p_C$ as the proportion of points with the label.
- The surprise of drawing a point from the node at random and having it be class $C$ is:
$$-\log_2 p_C$$
- And the entropy of a node is the average surprise over all classes:
$$-\sum_C p_C \log_2 p_C$$
- The entropy of 🟠🟠🟠🟠🟠🟠🟠🟠 is $ -1 \log_2(1) = 0 $.
- The entropy of 🟠🟠🟠🟠🔵🔵🔵🔵 is $ -0.5 \log_2(0.5) - 0.5 \log_2(0.5) = 1 $.
- The entropy of 🟠🔵🟢🟡🟣 is $ -\log_2 \frac{1}{5} = \log_2(5) $.
- In general, if there are $n$ points, all with different labels, the entropy is $ \log_2(n) $.
Entropy Example¶
Suppose we have:
Split A:
- "Yes": 🟠🟠🟠🟠🟠🟠🔵
- "No": 🟠🟠🟠🟠🟠🟠🔵🔵🔵🔵🔵
Split B:
- "Yes": 🟠🟠🟠🟠🟠🟠🔵🔵🔵
- "No": 🟠🟠🟠🟠🟠🟠🔵🔵🔵
def entropy(labels):
props = pd.Series(list(labels)).value_counts() / len(labels)
return -sum(props * np.log2(props))
split_a = entropy("🟠🟠🟠🟠🟠🟠🔵") + entropy("🟠🟠🟠🟠🟠🟠🔵🔵🔵🔵🔵")
split_b = entropy("🟠🟠🟠🟠🟠🟠🔵🔵🔵") + entropy("🟠🟠🟠🟠🟠🟠🔵🔵🔵")
split_a, split_b
(1.5857029900592838, 1.8365916681089791)
Split A has lower entropy, so we'll pick it.
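Here's a rough sketch of the "try all splits" idea for a single numeric feature, reusing the entropy function above. It minimizes the unweighted sum $ L(X_{\text{left}}, y_{\text{left}}) + L(X_{\text{right}}, y_{\text{right}}) $ from earlier; note that sklearn additionally weights each side by its fraction of points, so the threshold found here may differ from the 129.5 it chose.
def best_split(x, y):
    # Candidate thresholds: midpoints between consecutive unique feature values.
    values = np.sort(x.unique())
    candidates = (values[:-1] + values[1:]) / 2
    best_v, best_loss = None, np.inf
    for v in candidates:
        # Unweighted sum of entropies, as in the slides.
        loss = entropy(y[x <= v]) + entropy(y[x > v])
        if loss < best_loss:
            best_v, best_loss = v, loss
    return best_v, best_loss
best_split(X_train['Glucose'], y_train)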
Runtime (optional)¶
Predict a point: traverse tree until leaf.
- Runtime is $ O(\text{tree depth}) $.
- If all features are binary (two categories), then tree depth ≤ $d$ (number of features).
- Usually depth is $O(\log n)$, but not always.
Training:
- For binary features, we need to try $ O(d) $ splits at each node.
- For numeric features, there's a way to check all splits in $ O(n') $ time, where $ n' $ is the number of points in the node. Since there can be $ d $ numeric features, overall runtime is $ O(n'd) $ for each node.
- Each point is used in $ O(\text{depth}) $ nodes, so overall runtime to fit is $ O(nd \cdot \text{depth}) $.
- Since depth is often logarithmic, runtime is pretty fast!
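As an optional, rough empirical check of the training runtime, we can time fits on random data as $n$ grows; with the depth capped, fit times should grow roughly linearly in $n$ (up to log factors).
import time
for n in [1_000, 10_000, 100_000]:
    X_rand = np.random.rand(n, 5)          # d = 5 random features.
    y_rand = np.random.randint(0, 2, n)    # Random binary labels.
    start = time.perf_counter()
    DecisionTreeClassifier(max_depth=10).fit(X_rand, y_rand)
    print(f'n = {n}: {time.perf_counter() - start:.3f} seconds')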
Tree depth¶
Decision trees are trained by recursively picking the best split until:
- all "leaf nodes" only contain training examples from a single class (good), or
- it is impossible to split leaf nodes any further (not good).
By default, there is no "maximum depth" for a decision tree. As such, without restriction, decision trees tend to be very deep.
dt_no_max = DecisionTreeClassifier()
dt_no_max.fit(X_train, y_train)
DecisionTreeClassifier()
A decision tree fit on our training data has a depth of around 20! (It is so deep that `tree.plot_tree` errors when trying to plot it.)
dt_no_max.tree_.max_depth
22
At first, this tree seems "better" than our tree of depth 2, since its training accuracy is much much higher:
dt_no_max.score(X_train, y_train)
0.9913194444444444
# Depth 2 tree.
dt.score(X_train, y_train)
0.765625
But recall, we truly care about test set performance, and this decision tree has worse accuracy on the test set than our depth 2 tree.
dt_no_max.score(X_test, y_test)
0.71875
# Depth 2 tree.
dt.score(X_test, y_test)
0.7760416666666666
Decision trees and overfitting¶
Decision trees have a tendency to overfit. Why is that?
Unlike linear classification techniques (like logistic regression or SVMs), decision trees are non-linear.
- They are also "non-parametric" – there are no $w^*$s to learn.
While being trained, decision trees ask enough questions to effectively memorize the correct response values in the training set. However, the relationships they learn are often overfit to the noise in the training set, and don't generalize well.
fig
A decision tree whose depth is not restricted will achieve 100% accuracy on any training set, as long as there are no "overlapping values" in the training set.
- Two values overlap when they have the same features $x$ but different response values $y$ (e.g. if two patients have the same glucose levels and BMI, but one has diabetes and one doesn't).
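As a quick sanity check, here's a sketch that counts how many feature combinations in our training set overlap in this sense; any conflicts found would help explain why dt_no_max's training accuracy above is 99.1% rather than a perfect 100%.
# Number of (Glucose, BMI) combinations that appear with more than one Outcome.
conflicts = (
    pd.concat([X_train, y_train], axis=1)
    .groupby(['Glucose', 'BMI'])['Outcome']
    .nunique()
)
(conflicts > 1).sum()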
One solution: Make the decision tree "less complex" by limiting the maximum depth.
Since `sklearn.tree`'s `plot_tree` can't visualize extremely large decision trees, let's create and visualize some smaller decision trees.
trees = {}
for d in [2, 4, 8]:
trees[d] = DecisionTreeClassifier(max_depth=d, random_state=1)
trees[d].fit(X_train, y_train)
plt.figure(figsize=(15, 5), dpi=100)
plot_tree(trees[d], feature_names=X_train.columns, class_names=['no db', 'yes db'],
filled=True, rounded=True, impurity=False)
plt.show()
As tree depth increases, complexity increases, and our trees are more prone to overfitting.
Question: What is the "right" maximum depth to choose?
Hyperparameters for decision trees¶
- `max_depth` is a hyperparameter for `DecisionTreeClassifier`.
- There are many more hyperparameters we can tweak; look at the documentation for examples.
    - `min_samples_split`: The minimum number of samples required to split an internal node.
    - `criterion`: The function to measure the quality of a split (`'gini'` or `'entropy'`).
To ensure that our model generalizes well to unseen data, we need an efficient technique for trying different combinations of hyperparameters!
Thinking about bias and variance¶
- Bigger `max_depth` = less bias, more variance.
- Bigger `min_samples_split` = more bias, less variance. (Why?)
Grid search¶
`GridSearchCV` takes in:
- an un-`fit` instance of an estimator, and
- a dictionary of hyperparameter values to try,
and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.
from sklearn.model_selection import GridSearchCV
The following dictionary contains the values we're considering for each hyperparameter. (We're using `GridSearchCV` with 3 hyperparameters, but we could use it with even just a single hyperparameter.)
hyperparameters = {
'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],
'min_samples_split': [2, 5, 10, 20, 50, 100, 200],
'criterion': ['gini', 'entropy']
}
Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.
len(hyperparameters['max_depth']) * \
len(hyperparameters['min_samples_split']) * \
len(hyperparameters['criterion'])
140
`GridSearchCV` needs to be instantiated and `fit`.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
%%time
searcher.fit(X_train, y_train)
CPU times: user 1.05 s, sys: 1.68 ms, total: 1.05 s
Wall time: 1.06 s
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None], 'min_samples_split': [2, 5, 10, 20, 50, 100, 200]})
After being `fit`, the `best_params_` attribute provides us with the best combination of hyperparameters to use.
searcher.best_params_
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}
All of the intermediate results – validation accuracies for each fold, mean validation accuracies, etc. – are stored in the `cv_results_` attribute:
searcher.cv_results_['mean_test_score'] # Array of length 140.
array([0.73, 0.73, 0.73, ..., 0.75, 0.74, 0.72])
# Rows correspond to folds, columns correspond to hyperparameter combinations.
pd.DataFrame(np.vstack([searcher.cv_results_[f'split{i}_test_score'] for i in range(5)]))
 | 0 | 1 | 2 | 3 | ... | 136 | 137 | 138 | 139 |
---|---|---|---|---|---|---|---|---|---|
0 | 0.71 | 0.71 | 0.71 | 0.71 | ... | 0.70 | 0.68 | 0.71 | 0.73 |
1 | 0.77 | 0.77 | 0.77 | 0.77 | ... | 0.82 | 0.83 | 0.77 | 0.76 |
2 | 0.74 | 0.74 | 0.74 | 0.74 | ... | 0.68 | 0.72 | 0.74 | 0.73 |
3 | 0.70 | 0.70 | 0.70 | 0.70 | ... | 0.77 | 0.79 | 0.76 | 0.70 |
4 | 0.72 | 0.72 | 0.72 | 0.72 | ... | 0.70 | 0.71 | 0.72 | 0.70 |
5 rows × 140 columns
Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!
Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.
searcher.best_params_
{'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 50}
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
final_tree.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=4, min_samples_split=50)
# Training accuracy.
final_tree.score(X_train, y_train)
0.7881944444444444
# Testing accuracy.
final_tree.score(X_test, y_test)
0.765625
Remember, `searcher` itself is a model object (we had to `fit` it). After performing $k$-fold cross-validation, behind the scenes, `searcher` is trained on the entire training set using the optimal combination of hyperparameters.
In other words, `searcher` makes the same predictions that `final_tree` does!
searcher.score(X_train, y_train)
0.7881944444444444
searcher.score(X_test, y_test)
0.765625
Choosing possible hyperparameter values¶
A full grid search can take a long time.
- In our previous example, we tried 140 combinations of hyperparameters.
- Since we performed 5-fold cross-validation, we trained 700 decision trees under the hood.
Question: How do we pick the possible hyperparameter values to try?
Answer: Trial and error.
- If the "best" choice of a hyperparameter was at an extreme, try increasing the range.
- For instance, if you try
max_depth
s from 32 to 128, and 32 was the best, try includingmax_depths
under 32.
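For instance, since max_depth=4 and min_samples_split=50 both fell in the interior of our grid, a hypothetical follow-up search might zoom in around them instead:
# A finer (hypothetical) grid centered on the best values found above.
finer_grid = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [30, 40, 50, 60, 70],
    'criterion': ['gini'],
}
finer_searcher = GridSearchCV(DecisionTreeClassifier(), finer_grid, cv=5)
finer_searcher.fit(X_train, y_train)
finer_searcher.best_params_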
Key takeaways¶
- Decision trees are trained by finding the best questions to ask using the features in the training data. A good question is one that isolates classes as much as possible.
- Decision trees have a tendency to overfit to training data. One way to mitigate this is by restricting the maximum depth of the tree.
- To efficiently find hyperparameters through cross-validation, use `GridSearchCV`.
    - Specify which values to try for each hyperparameter, and `GridSearchCV` will try all unique combinations of hyperparameters and return the combination with the best average validation performance.
    - `GridSearchCV` is not the only solution – see `RandomizedSearchCV` if you're curious.
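If you are curious, here's a minimal sketch of the `RandomizedSearchCV` alternative: with `n_iter=20`, it samples 20 random combinations from the same grid instead of trying all 140.
from sklearn.model_selection import RandomizedSearchCV
rand_searcher = RandomizedSearchCV(
    DecisionTreeClassifier(), hyperparameters, n_iter=20, cv=5, random_state=1
)
rand_searcher.fit(X_train, y_train)
rand_searcher.best_params_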
- Specify which values to try for each hyperparameter, and