Lecture 24 – Decision Trees, Grid Search, Multicollinearity

DSC 80, Winter 2023

Announcements

Agenda

Cross-validation

Recap

$k$-fold cross-validation

Instead of relying on a single validation set, we can create $k$ validation sets, where $k$ is some positive integer (5 in the example below).

Since each data point is used for training $k-1$ times and validation once, the (averaged) validation performance should be a good metric of a model's ability to generalize to unseen data.

$k$-fold cross-validation (or simply "cross-validation") is the technique we will use for finding hyperparameters.

$k$-fold cross-validation

First, shuffle the dataset randomly and split it into $k$ disjoint groups. Then, for each group:

  1. Use that group as the validation set and train the model on the remaining $k-1$ groups.

  2. Compute the model's error on the held-out group.

Finally, average the $k$ validation errors to get a single estimate of the model's ability to generalize.

As a reminder, here's what "sample 1" looks like.

$k$-fold cross-validation in sklearn

Soon, we'll look at how to implement this procedure without needing to for-loop over values of d.
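For reference, here's a rough sketch of that for-loop, with synthetic data standing in for the lecture's "sample 1" and degrees 1 through 25 assumed (25 degrees times 5 folds would match the 125 models mentioned below):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Synthetic data standing in for the lecture's "sample 1".
rng = np.random.default_rng(23)
sample_1 = pd.DataFrame({'x': rng.uniform(-2, 2, size=100)})
sample_1['y'] = sample_1['x'] ** 3 + rng.normal(scale=0.5, size=100)

errs = {}
for d in range(1, 26):  # 25 degrees * 5 folds = 125 models in total
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    # Five negative RMSEs per degree, one per fold of the training data.
    scores = cross_val_score(pl, sample_1[['x']], sample_1['y'],
                             cv=5, scoring='neg_root_mean_squared_error')
    errs[d] = -scores.mean()  # average validation RMSE for degree d

best_degree = min(errs, key=errs.get)  # degree with the lowest average validation RMSE
```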

$k$-fold cross-validation in sklearn

Note that for each choice of degree (our hyperparameter), we have five RMSEs, one for each "fold" of the data. This means that in total, 125 models were trained/fit to data!

We should choose the degree with the lowest average validation RMSE.

Note that if we didn't perform $k$-fold cross-validation, but instead just used a single validation set, we may have ended up with a different result:

Note: You may notice that the RMSEs in Folds 1 and 5 are significantly higher than in other folds. Can you think of reasons why, and how we might fix this?

Another example: Tips

We can also use $k$-fold cross-validation to determine which subset of features to use in a linear model that predicts tips (though, as you'll see, the code is not pretty).

As we should always do, we'll perform a train-test split on tips and will only use the training data for cross-validation.
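A hedged sketch of this process, using seaborn's copy of the tips dataset; the candidate feature subsets below are illustrative and may differ from the ones compared in lecture:

```python
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split

tips = sns.load_dataset('tips')
X_train, X_test, y_train, y_test = train_test_split(
    tips[['total_bill', 'size']], tips['tip'], random_state=1
)

# Candidate feature subsets (illustrative only).
subsets = [['total_bill'], ['size'], ['total_bill', 'size']]
for cols in subsets:
    scores = cross_val_score(LinearRegression(), X_train[cols], y_train,
                             cv=5, scoring='neg_root_mean_squared_error')
    print(cols, -scores.mean())  # average validation RMSE for this subset
```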

Even though the third model has the lowest average validation RMSE, its average validation RMSE is very close to that of the other, simpler models, and as a result we'd likely use the simplest model in practice.

Summary: Generalization

  1. Split the data into two sets: training and test.

  2. Use only the training data when designing, training, and tuning the model.

    • Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
    • Do not ❌ look at the test data in this step!
  3. Commit to your final model and train it using the entire training set.

  4. Evaluate the model using the test data. If the performance (e.g. RMSE) is not acceptable, return to step 2.

  5. Finally, train on all available data and ship the model to production! 🛳

🚨 This is the process you should always use! 🚨

Discussion Question 🤔

Example: Decision trees 🌲

Decision trees can be used for both regression and classification. We will start by discussing their use in classification.

Example: Predicting diabetes

Exploring the dataset

Class 0 (orange) is "no diabetes" and class 1 (blue) is "diabetes".

Building a decision tree

Let's build a decision tree and interpret the results. But first, a train-test split:

The relevant class is DecisionTreeClassifier, from sklearn.tree.

Note that we fit it the same way we fit earlier estimators.

_You may wonder what max_depth=2 does – more on this soon!_
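A minimal sketch of these steps, assuming the data is in a DataFrame named diabetes whose 'Outcome' column is 1 for "diabetes" and 0 for "no diabetes" (the variable and column names are assumptions based on the standard Pima diabetes dataset, not necessarily the exact ones from lecture):

```python
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Assumed feature columns; the lecture may use a different subset.
X = diabetes[['Glucose', 'BMI']]
y = diabetes['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

dt = DecisionTreeClassifier(max_depth=2)
dt.fit(X_train, y_train)  # fit works the same way as with earlier estimators
```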

Visualizing decision trees

Our fit decision tree is like a "flowchart", made up of a series of questions.

As before, orange is "no diabetes" and blue is "diabetes".
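One way to draw the fitted tree is sklearn's plot_tree (a sketch; the exact plotting options used in lecture may differ):

```python
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Draw the fitted tree as a flowchart of yes/no questions.
plt.figure(figsize=(10, 5))
plot_tree(dt, feature_names=list(X_train.columns),
          class_names=['no diabetes', 'diabetes'], filled=True, rounded=True)
plt.show()
```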

Evaluating classifiers

The most common evaluation metric in classification is accuracy:

$$\text{accuracy} = \frac{\text{# data points classified correctly}}{\text{# data points}}$$

The score method of a classifier computes accuracy by default (just like the score method of a regressor computes $R^2$ by default). We want our classifiers to have high accuracy.
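For instance, with the depth-2 tree fit above (a sketch continuing from the earlier code):

```python
# Accuracy on the training and test sets.
dt.score(X_train, y_train)
dt.score(X_test, y_test)

# score is equivalent to computing accuracy "by hand":
(dt.predict(X_test) == y_test).mean()
```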

Some questions...

Training a decision tree

When we ask a question, we are effectively splitting a node into two children – the "yes" child and the "no" child.

Suppose the distribution within a node looks like this (colors represent classes):

🟠🟠🟠🔵🔵🔵🔵🔵🔵🔵

Question A splits the node like this:

Question B splits the node like this:

Which question is "better"?

Question B, because there is "less uncertainty" in the resulting nodes after splitting by Question B than there is after splitting by Question A. There are two common techniques for quantifying "uncertainty": Gini impurity and entropy.

Not the focus of our course, but read more!
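For instance, here's how both measures could be computed for the ten-point node above (3 orange, 7 blue); this is a supplementary sketch, not code from the lecture:

```python
import numpy as np

# Class proportions in the node: 3 orange, 7 blue.
p = np.array([3, 7]) / 10

gini = 1 - np.sum(p ** 2)          # 1 - (0.3**2 + 0.7**2) = 0.42
entropy = -np.sum(p * np.log2(p))  # about 0.881 bits

# Both measures are 0 for a "pure" node and largest when the two classes are 50/50.
```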

Tree depth

Decision trees are trained by recursively picking the best split until all leaves are "pure" (contain data points from only one class), or until some other stopping criterion (such as a maximum depth) is reached.

By default, there is no "maximum depth" for a decision tree. As such, without restriction, decision trees tend to be very deep.

A decision tree fit on our training data has a depth of around 20! (It is so deep that tree.plot_tree errors when trying to plot it.)

At first, this tree seems "better" than our tree of depth 2, since its training accuracy is much much higher:

But recall, we truly care about test set performance, and this decision tree has worse accuracy on the test set than our depth 2 tree.
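A sketch of this comparison, continuing from the diabetes code above:

```python
# With no maximum depth, the tree keeps splitting until its leaves are (nearly) pure.
dt_deep = DecisionTreeClassifier()
dt_deep.fit(X_train, y_train)

dt_deep.get_depth()              # very deep (around 20 on the lecture's data)
dt_deep.score(X_train, y_train)  # near-perfect training accuracy...
dt_deep.score(X_test, y_test)    # ...but lower test accuracy than the depth-2 tree
```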

Decision trees and overfitting

Since sklearn.tree's plot_tree can't visualize extremely large decision trees, let's create and visualize some smaller decision trees.

As tree depth increases, complexity increases, and our trees are more prone to overfitting.

Question: What is the "right" maximum depth to choose?

Hyperparameters for decision trees

GridSearchCV takes in:

  • an un-fit instance of an estimator, and
  • a dictionary of hyperparameter values to try,

and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.

The following dictionary contains the values we're considering for each hyperparameter. (We're using GridSearchCV with 3 hyperparameters, but we could use it with even just a single hyperparameter.)
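One illustrative grid with $7 \times 10 \times 2 = 140$ combinations; the specific candidate values are assumptions and may differ from the lecture's:

```python
# An illustrative grid: 7 * 10 * 2 = 140 combinations in total.
# The specific candidate values are assumptions, not the lecture's exact grid.
hyperparameters = {
    'max_depth': [2, 3, 4, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20, 50, 100, 150, 200, 250, 300],
    'criterion': ['gini', 'entropy'],
}
```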

Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.

GridSearchCV needs to be instantiated and fit.

After being fit, the best_params_ attribute provides us with the best combination of hyperparameters to use.
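A sketch, using the grid above and the diabetes training data from earlier:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validation over every combination in `hyperparameters`.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
searcher.fit(X_train, y_train)

searcher.best_params_  # the combination with the highest average validation accuracy
```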

All of the intermediate results – validation accuracies for each fold, mean validation accuracies, etc. – are stored in the cv_results_ attribute:
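For example, wrapping it in a DataFrame (continuing the sketch above) gives one row per hyperparameter combination:

```python
import pandas as pd

# cv_results_ is a dictionary; a DataFrame makes it easier to read.
pd.DataFrame(searcher.cv_results_)
```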

Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!

Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.

Remember, searcher itself is a model object (we had to fit it). After performing $k$-fold cross-validation, behind the scenes, searcher is trained on the entire training set using the optimal combination of hyperparameters.

In other words, searcher makes the same predictions that final_tree does!
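A sketch of both options, continuing from the searcher above (GridSearchCV's default refit=True is what triggers that final re-training):

```python
# Option 1: fit a fresh tree on the full training set with the best hyperparameters.
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree.fit(X_train, y_train)

# Option 2: use searcher directly, since (with refit=True, the default)
# it was already re-fit on the full training set after cross-validation.
(searcher.predict(X_test) == final_tree.predict(X_test)).all()
```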

Choosing possible hyperparameter values

Key takeaways

Multicollinearity

Heights and weights

We have a dataset containing the weights and heights of 25,000 18-year-olds, taken from here.

Motivating example

Suppose we fit a simple linear regression model that uses height in inches to predict weight in pounds.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)}$$

$w_0^*$ and $w_1^*$ are shown below, along with the model's testing RMSE.
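A hedged sketch of this first model, assuming the data is in a DataFrame named heights_weights with columns 'Height (Inches)' and 'Weight (Pounds)' (the variable name and exact column names are assumptions):

```python
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = heights_weights[['Height (Inches)']]
y = heights_weights['Weight (Pounds)']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

lr_one = LinearRegression()
lr_one.fit(X_train, y_train)
lr_one.intercept_, lr_one.coef_  # w0* and w1*
```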

Now, suppose we fit another regression model, that uses height in inches AND height in centimeters to predict weight.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)} + w_2 \cdot \text{height (cm)}$$

What are $w_0^*$, $w_1^*$, $w_2^*$, and the model's testing RMSE?

Observation: The intercept is the same as before (roughly -81.17), as is the testing RMSE. However, the coefficients on 'Height (Inches)' and 'Height (cm)' are massive in size!
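A sketch of how the second model could be fit, continuing from above; the redundant 'Height (cm)' column is just 2.54 times the height in inches:

```python
# Add a perfectly redundant feature: height in cm = 2.54 * height in inches.
X_train_both = X_train.assign(**{'Height (cm)': X_train['Height (Inches)'] * 2.54})

lr_both = LinearRegression()
lr_both.fit(X_train_both, y_train)
lr_both.intercept_, lr_both.coef_  # same intercept, but huge, unstable coefficients
```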

What's going on?

Redundant features

Let's use simpler numbers for illustration. Suppose in the first model, $w_0^* = -80$ and $w_1^* = 3$.

$$\text{predicted weight (pounds)} = -80 + 3 \cdot \text{height (inches)}$$

In the second model, we have:

$$\begin{align*}\text{predicted weight (pounds)} &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \text{height (cm)} \\ &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \big( 2.54 \cdot \text{height (inches)} \big) \\ &= w_0^* + \left(w_1^* + 2.54 \cdot w_2^* \right) \cdot \text{height (inches)} \end{align*}$$

In the first model, we already found the "best" intercept ($-80$) and slope ($3$) in a linear model that uses height in inches to predict weight.

So, as long as $w_1^* + 2.54 \cdot w_2^* = 3$ in the second model, the second model's training predictions will be the same as the first, and hence they will also minimize RMSE.

Infinitely many parameter choices

Issue: There are an infinite number of $w_1^*$ and $w_2^*$ that satisfy $w_1^* + 2.54 \cdot w_2^* = 3$!

$$\text{predicted weight} = -80 - 10 \cdot \text{height (inches)} + \frac{13}{2.54} \cdot \text{height (cm)}$$
$$\text{predicted weight} = -80 + 10 \cdot \text{height (inches)} - \frac{7}{2.54} \cdot \text{height (cm)}$$
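As a quick check, all of these parameter choices make the same prediction, e.g. for a 70-inch-tall (177.8 cm) person (a supplementary arithmetic check using the simplified numbers above):

```python
h_in = 70
h_cm = 2.54 * h_in

# All three parameterizations predict (up to floating-point rounding) 130 pounds.
print(-80 + 3 * h_in)
print(-80 - 10 * h_in + (13 / 2.54) * h_cm)
print(-80 + 10 * h_in - (7 / 2.54) * h_cm)
```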

Multicollinearity

Key takeaways

Summary, next time

Summary

See the individual sections for more specific "key takeaways".

Next time