In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'

# Carryover setup from last lecture
import seaborn as sns
tips = sns.load_dataset('tips')

from sklearn.linear_model import LinearRegression

import util

import warnings
warnings.simplefilter('ignore')

Lecture 23 – Cross-Validation¶

DSC 80, Spring 2023¶

Agenda¶

  • Generalization.
  • Train-test split.
  • Hyperparameters.
  • Cross-validation.

Generalization¶

Recall, last time, we drew two samples from the same data generating process, and fit polynomials of degree 1, 3, and 25 on each sample.

In [2]:
np.random.seed(23) # For reproducibility.

def sample_dgp(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_dgp()
sample_2 = sample_dgp()

When trained on sample 1, the degree 25 polynomial had the lowest RMSE on sample 1.

In [3]:
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

But, when trained on sample 1, the degree 3 polynomial had the lowest RMSE on sample 2.

In [4]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

If we train polynomials of degree 1, 3, and 25 on each sample, we see that the degree 25 polynomials vary more than the degree 1 and 3 polynomials do.

In [5]:
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

Bias and variance¶

The training data we have access to is a sample from the DGP. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same DGP.

Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise:

  • ⭐️ Bias: The expected deviation between a predicted value and an actual value.
    • In other words, for a given $x_i$, how far is $H(x_i)$ from the true $y_i$, on average?
    • Low bias is good! ✅
    • High bias is a sign of underfitting, i.e. that our model is too basic to capture the relationship between our features and response.
  • ⭐️ Model variance ("variance"): The variance of a model's predictions.
    • In other words, for a given $x_i$, what is the variance of $H(x_i)$ across all datasets?
    • Low model variance is good! ✅
    • High model variance is a sign of overfitting, i.e. that our model is too complicated and is prone to fitting to the noise in our training data.
  • Observation variance: The variance due to the random noise in the process we are trying to model (e.g. measurement error). We can't control this, without collecting more data!

Here, suppose (in the classic bias-variance "dartboard" picture):

  • The red bullseye represents your true weight and height 🧍.
  • The dark blue darts represent predictions of your weight and height using models that were fit on different samples drawn from the same DGP.

We'd like our models to land in the top left (low bias and low model variance), but in practice that's hard to achieve! The short simulation sketch below makes "model variance" more concrete.
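
As a rough illustration of model variance (this simulation is not part of util.py; it only uses sample_dgp and numpy, both defined/imported above), we can refit polynomials of degree 1 and 25 on many fresh samples from the DGP and look at how much the prediction at a single point, say $x = 2$, varies from fit to fit:

# Refit the same model on many fresh samples and record the prediction at x = 2.
preds = {1: [], 25: []}
for _ in range(100):
    sample = sample_dgp()
    for deg in [1, 25]:
        coefs = np.polyfit(sample['x'], sample['y'], deg)
        preds[deg].append(np.polyval(coefs, 2))

# The spread (standard deviation) of these predictions is a proxy for model variance;
# the degree 25 predictions should vary far more than the degree 1 predictions.
{deg: np.std(preds[deg]) for deg in preds}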

Train-test split¶

Avoiding overfitting¶

  • We won't know whether our model has overfit to our sample (training data) unless we get to see how well it performs on a new sample from the same DGP.
  • 💡Idea: Split our sample into a training set and test set.
  • Use only the training set to fit the model (i.e. find $w^*$).
  • Use the test set to evaluate the model's error (RMSE, $R^2$).
  • The test set is like a new sample of data from the same DGP as the training data!
    • Similar to bootstrapping (but not quite the same, because there is no resampling involved).
    • If our sample is not representative of the DGP, this method has limited effectiveness!

Train-test split 🚆¶

sklearn.model_selection.train_test_split implements a train-test split for us! 🙏🏼

If X is an array/DataFrame of features and y is an array/Series of responses,

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

randomly splits the features and responses into training and test sets, such that the test set contains 25% of the rows of the full dataset.

In [6]:
from sklearn.model_selection import train_test_split
In [7]:
# Read the documentation!
train_test_split?

Let's perform a train/test split on our tips dataset.

In [8]:
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # We don't have to choose 0.25.

Before proceeding, let's check the sizes of X_train and X_test.

In [9]:
print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())
Rows in X_train: 195
total_bill sex smoker day time size
4 24.59 Female No Sun Dinner 4
209 12.76 Female Yes Sat Dinner 2
178 9.60 Female Yes Sun Dinner 2
230 24.01 Male Yes Sat Dinner 4
5 25.29 Male No Sun Dinner 4
Rows in X_test: 49
total_bill sex smoker day time size
146 18.64 Female No Thur Lunch 3
224 13.42 Male Yes Fri Lunch 2
134 18.26 Female No Thur Lunch 2
131 20.27 Female No Thur Lunch 2
147 11.87 Female No Thur Lunch 2
In [10]:
X_train.shape[0] / tips.shape[0]
Out[10]:
0.7991803278688525

Example train-test split¶

Steps:

  1. Fit a model on the training set.
  2. Evaluate the model on the test set.
In [11]:
tips.head()
Out[11]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [12]:
X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips.
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random_state is like np.random.seed.

Here, we'll use a stand-alone LinearRegression model without a Pipeline, but this process would work the same if we were using a Pipeline.

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[13]:
LinearRegression()

Let's check our model's performance on the training set first.

In [14]:
from sklearn.metrics import mean_squared_error # Built-in RMSE/MSE function.
In [15]:
pred_train = lr.predict(X_train)
rmse_train = mean_squared_error(y_train, pred_train, squared=False)
rmse_train
Out[15]:
0.9803205287924737

And the test set:

In [16]:
pred_test = lr.predict(X_test)
rmse_test = mean_squared_error(y_test, pred_test, squared=False)
rmse_test
Out[16]:
1.138177129113125

Since rmse_train and rmse_test are similar, it doesn't seem like our model is overfitting to the training data. If rmse_test were much larger than rmse_train, that would be evidence that our model is unable to generalize well.

Hyperparameters¶

Example: Polynomial regression¶

We recently looked at an example of polynomial regression.

In [17]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

When building these models:

  • We got to choose the degree of the polynomials (i.e. we chose 1, 3, and 25).
  • We didn't get to choose the exact formulas for the three polynomials – their formulas were learned from data.

Parameters vs. hyperparameters¶

  • A parameter defines the relationship between variables in a model.

    • We learn parameters from data.
    • For instance, suppose we fit a degree 3 polynomial to data, and end up with

    $$H(x) = 1 - 2x + 13x^2 - 4x^3$$

    • 1, -2, 13, and -4 are parameters.
  • A hyperparameter is a parameter that we get to choose before our model is fit to the data.
    • Think of hyperparameters as knobs 🎛 – we get to pick and tune them!
    • Polynomial degree was a hyperparameter in the previous example, and we tried three different values – 1, 3, and 25. (The short sketch after this list shows the parameter/hyperparameter distinction in code.)
  • Question: How do we choose the "right" hyperparameter(s)?
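
To make the parameter/hyperparameter distinction concrete, here's a minimal sketch (not from the original notebook; it adds the Pipeline and PolynomialFeatures imports that also appear a bit later in this lecture and reuses sample_1): the degree is fixed before calling fit, while the intercept and coefficients come out of fit.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree 3 is a hyperparameter: we choose it *before* fitting.
pl = Pipeline([('poly', PolynomialFeatures(3)), ('lin-reg', LinearRegression())])
pl.fit(sample_1[['x']], sample_1['y'])

# The intercept and coefficients are parameters: they are *learned* from the data.
pl.named_steps['lin-reg'].intercept_, pl.named_steps['lin-reg'].coef_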

Training error vs. test error¶

  • We know that a model's performance on a test set is a good estimate of its ability to generalize to unseen data.
  • We want to find the hyperparameter that leads to the best test set performance.
  • Idea:
    1. Come up with a list of hyperparameters to try.
    2. For each hyperparameter, train the model on the training set and compute its performance on the test set.
    3. Pick the hyperparameter with the best performance on the test set.

Training error vs. test error¶

  • Let's try this strategy on sample 1 from our earlier example.
  • We'll try to fit a polynomial model on the dataset; we'll choose the polynomial's degree from the list [1, 2, ..., 25].
In [18]:
px.scatter(sample_1, x='x', y='y', title='Sample 1', template=TEMPLATE)

First, we perform a train-test split.

In [19]:
X = sample_1[['x']]
y = sample_1['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

Then, we'll implement the logic from the previous slide.

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
In [21]:
train_errs = []
test_errs = []

for d in range(1, 26):
    pl = Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
    pl.fit(X_train, y_train)
    train_errs.append(mean_squared_error(y_train, pl.predict(X_train), squared=False))
    test_errs.append(mean_squared_error(y_test, pl.predict(X_test), squared=False))

Let's look at the plots of training error vs. degree and test error vs. degree.

In [22]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=np.arange(1, 26), y=train_errs, name='Training Error')
)

fig.add_trace(
    go.Scatter(x=np.arange(1, 26), y=test_errs, name='Test Error', line={'color': 'orange'})
)

fig.update_layout(showlegend=True, xaxis_title='Degree', yaxis_title='RMSE')
  • Training error appears to decrease as polynomial degree increases.
  • Test error appears to decrease until a "valley", and then increases again.
  • Here, we'd choose a degree of 3, since that degree has the lowest test error.

Training error vs. test error¶

The pattern we saw in the previous example is true more generally.

We pick the hyperparameter(s) at the "valley" of test error.

Note that training error tends to underestimate test error, but it doesn't have to – i.e., it is possible for test error to be lower than training error (say, if the test set is "easier" to predict than the training set).

Conducting train-test splits¶

  • Recall, training data is used to fit our model, and test data is used to evaluate our model.
  • Question: How should we split?
    • sklearn's train_test_split splits randomly, which usually works well.
    • However, if there is some element of time in the data (say, when predicting the future price of a stock), a better split is "past" (training) and "future" (test) – see the sketch after this list.
  • Question: How large should the split be, e.g. 90%-10% vs. 75%-25%?
    • There's a tradeoff – a larger training set should lead to a "better" model, while a larger test set should lead to a better estimate of our model's ability to generalize.
    • There's no "right" choice, but we usually pick a split somewhere in that range (i.e., between 75%-25% and 90%-10%).
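
Here's a minimal sketch of a time-based ("past"/"future") split, on a hypothetical stocks DataFrame invented just for this illustration:

# Hypothetical example data: one price per day (not a dataset from this lecture).
stocks = pd.DataFrame({
    'date': pd.date_range('2023-01-01', periods=100, freq='D'),
    'price': 100 + np.random.normal(0, 1, 100).cumsum()
})

# Train on the earliest 75% of days and test on the latest 25%,
# instead of splitting the rows randomly.
stocks = stocks.sort_values('date')
cutoff = int(len(stocks) * 0.75)
stocks_train = stocks.iloc[:cutoff]
stocks_test = stocks.iloc[cutoff:]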

But wait...¶

  • With our current strategy, we are choosing the hyperparameter that creates the model that performs best on the test set.
  • As such, we are overfitting to the test set – the best hyperparameter for the test set might not be the best hyperparameter for a totally unseen dataset!
  • It seems like we need another split.

Cross-validation¶

A single validation set¶

  1. Split the data into three sets: training, validation, and test. (A minimal sketch of this three-way split appears after this list.)
  2. For each hyperparameter choice, train the model only on the training set, and evaluate the model's performance on the validation set.
  3. Find the hyperparameter with the best validation performance.
  4. Retrain the final model on the training and validation sets, and report its performance on the test set.
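
Here's a minimal sketch of the three-way split itself, using sample_1 and two calls to train_test_split (the roughly 60%-20%-20% proportions are just an illustrative choice):

# First carve off a 20% test set, then carve a validation set out of the remaining 80%.
X_temp, X_test, y_temp, y_test = train_test_split(
    sample_1[['x']], sample_1['y'], test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1)

# 0.25 of the remaining 80% is 20% of the original data,
# so this is roughly a 60%-20%-20% train-validation-test split.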

Issue: This strategy is too dependent on the validation set, which may be small and/or not a representative sample of the data.

$k$-fold cross-validation¶

Instead of relying on a single validation set, we can create $k$ validation sets, where $k$ is some positive integer (a common choice, used later in this lecture, is 5).

Since each data point is used for training $k-1$ times and validation once, the (averaged) validation performance should be a good metric of a model's ability to generalize to unseen data.

$k$-fold cross-validation (or simply "cross-validation") is the technique we will use for finding hyperparameters.

Creating folds in sklearn¶

sklearn has a KFold class that splits data into training and validation folds.

In [23]:
from sklearn.model_selection import KFold

Let's use a simple dataset for illustration.

In [24]:
data = np.arange(10, 70, 10)
data
Out[24]:
array([10, 20, 30, 40, 50, 60])

Let's instantiate a KFold object with $k=3$.

In [25]:
kfold = KFold(3, shuffle=True, random_state=1)
kfold
Out[25]:
KFold(n_splits=3, random_state=1, shuffle=True)

Finally, let's use kfold to split data:

In [26]:
for train, val in kfold.split(data):
    print(f'train: {data[train]}, validation: {data[val]}')
train: [10 40 50 60], validation: [20 30]
train: [20 30 40 60], validation: [10 50]
train: [10 20 30 50], validation: [40 60]

Note that each value in data is used for validation exactly once and for training exactly twice. Also note that because we set shuffle=True, the validation folds are not simply [10, 20], [30, 40], and [50, 60].

$k$-fold cross-validation¶

First, shuffle the dataset randomly and split it into $k$ disjoint groups. Then:

  • For each hyperparameter:
    • For each unique group:
      • Let the unique group be the "validation set".
      • Let all other groups be the "training set".
      • Train a model using the selected hyperparameter on the training set.
      • Evaluate the model on the validation set.
    • Compute the average validation score (e.g. RMSE) for the particular hyperparameter.
  • Choose the hyperparameter with the best average validation score.
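
Here's a minimal sketch of this procedure written directly with KFold, again choosing a polynomial degree for sample_1 (everything used here was already imported above; the candidate degrees [1, 3, 25] and $k = 5$ are just illustrative choices):

# Manual 5-fold cross-validation over a few candidate degrees.
avg_errs = {}
kfold = KFold(5, shuffle=True, random_state=1)

for d in [1, 3, 25]:  # Candidate hyperparameter values.
    fold_errs = []
    for train_idx, val_idx in kfold.split(sample_1):
        train, val = sample_1.iloc[train_idx], sample_1.iloc[val_idx]
        pl = Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
        pl.fit(train[['x']], train['y'])
        fold_errs.append(mean_squared_error(val['y'], pl.predict(val[['x']]), squared=False))
    avg_errs[d] = np.mean(fold_errs)

# Pick the hyperparameter value (degree) with the lowest average validation RMSE.
min(avg_errs, key=avg_errs.get)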

$k$-fold cross-validation in sklearn¶

While you could manually use KFold to perform cross-validation, the cross_val_score function in sklearn implements $k$-fold cross-validation for us!

cross_val_score(estimator, X_train, y_train, cv)

Specifically, it takes in:

  • A Pipeline or estimator that has not already been fit.
  • Training data.
  • A value of $k$ (through the cv argument).
  • (Optionally) A scoring metric.

and performs $k$-fold cross-validation, returning the values of the scoring metric on each fold.

In [27]:
from sklearn.model_selection import cross_val_score

$k$-fold cross-validation in sklearn¶

  • Let's perform $k$-fold cross validation in order to help us pick a degree for polynomial regression from the list [1, 2, ..., 25].
  • We'll use $k=5$ since it's a common choice (and the default in sklearn).
  • For the sake of this example, we'll suppose sample_1 is our "training + validation data", i.e. that our test data is in some other dataset.
    • If this were not true, we'd first need to split sample_1 into separate training and test sets.
In [28]:
errs_df = pd.DataFrame()

for d in range(1, 26):
    pl = Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
    
    # The `scoring` argument is used to specify that we want to compute the RMSE; 
    # the default is R^2. It's called "neg" RMSE because, 
    # by default, sklearn likes to "maximize" scores, and maximizing -RMSE is the same
    # as minimizing RMSE.
    errs = cross_val_score(pl, sample_1[['x']], sample_1['y'], 
                           cv=5, scoring='neg_root_mean_squared_error')
    errs_df[f'Deg {d}'] = -errs # Negate to turn positive (sklearn computed negative RMSE).
    
errs_df.index = [f'Fold {i}' for i in range(1, 6)]
errs_df.index.name = 'Validation Fold'

Next class, we'll look at how to implement this procedure without needing to for-loop over values of d.

$k$-fold cross-validation in sklearn¶

Note that for each choice of degree (our hyperparameter), we have five RMSEs, one for each "fold" of the data. This means that in total, 125 models were trained/fit to data!

In [29]:
errs_df
Out[29]:
Deg 1 Deg 2 Deg 3 Deg 4 Deg 5 Deg 6 Deg 7 Deg 8 Deg 9 Deg 10 ... Deg 16 Deg 17 Deg 18 Deg 19 Deg 20 Deg 21 Deg 22 Deg 23 Deg 24 Deg 25
Validation Fold
Fold 1 4.789975 12.814865 5.040947 4.927487 8.853939 13.150006 18.023855 149.574851 897.759319 718.403458 ... 175100.068052 1.104271e+06 2.266879e+06 1.240114e+06 4.321905e+06 7.223294e+07 8.776463e+06 6.569429e+07 1.412632e+08 5.904442e+08
Fold 2 3.971600 5.359234 3.186076 3.218728 3.194274 3.007987 3.077643 3.127204 3.098704 3.559998 ... 4.123032 5.588813e+00 5.634207e+00 9.640386e+00 2.285274e+01 2.299261e+01 2.929238e+01 7.850249e+01 7.527421e+01 3.134011e+01
Fold 3 4.770176 2.557932 2.083676 2.110708 2.100757 2.013973 2.018524 1.996626 2.186183 3.946495 ... 2.682152 2.764271e+00 2.563292e+00 2.743327e+00 1.790691e+01 1.786843e+01 3.029923e+01 3.090558e+01 4.242389e+01 3.723872e+01
Fold 4 6.134098 4.656187 2.925225 2.932392 3.030811 3.040322 3.144730 3.160528 3.310559 3.070833 ... 7.248788 3.234748e+00 3.065008e+00 1.165178e+01 7.484082e+00 7.543781e+00 6.274907e+00 3.328220e+01 5.804341e+01 9.691200e+00
Fold 5 11.699003 11.916631 3.235882 4.372698 33.584482 51.745272 56.510688 92.223491 335.804723 1168.510898 ... 90577.446204 8.722900e+05 9.204615e+05 8.464799e+06 2.191781e+07 6.018466e+07 8.361500e+06 2.276297e+08 8.168459e+08 6.625413e+09

5 rows × 25 columns

We should choose the degree with the lowest average validation RMSE.

In [30]:
errs_df.mean().idxmin()
Out[30]:
'Deg 3'

Note that if we didn't perform $k$-fold cross-validation, but instead just used a single validation set, we may have ended up with a different result:

In [31]:
errs_df.idxmin(axis=1)
Out[31]:
Validation Fold
Fold 1    Deg 1
Fold 2    Deg 6
Fold 3    Deg 8
Fold 4    Deg 3
Fold 5    Deg 3
dtype: object

*Note*: You may notice that the RMSEs in Folds 1 and 5 are significantly higher than in other folds. Can you think of reasons why, and how we might fix this?

In [32]:
px.scatter(sample_1, x='x', y='y', title='Sample 1', template=TEMPLATE)

Another example: Tips¶

We can also use $k$-fold cross-validation to determine which subset of features to use in a linear model that predicts tips (though, as you'll see, the code is not pretty).

In [33]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

As we should always do, we'll perform a train-test split on tips and will only use the training data for cross-validation.

In [34]:
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [35]:
# A dictionary that maps names to Pipeline objects.
pipes = {
    'total_bill only': Pipeline([
        ('trans', ColumnTransformer(
            [('keep', FunctionTransformer(lambda x: x), ['total_bill'])], 
            remainder='drop')), 
        ('lin-reg', LinearRegression())
    ]),
    'total_bill + size': Pipeline([
        ('trans', ColumnTransformer(
            [('keep', FunctionTransformer(lambda x: x), ['total_bill', 'size'])], 
            remainder='drop')), 
        ('lin-reg', LinearRegression())
    ]),
    'total_bill + size + OHE smoker': Pipeline([
        ('trans', ColumnTransformer(
            [('keep', FunctionTransformer(lambda x: x), ['total_bill', 'size']),
             ('ohe', OneHotEncoder(), ['smoker'])], 
            remainder='drop')), 
        ('lin-reg', LinearRegression())
    ]),
    'total_bill + size + OHE all': Pipeline([
        ('trans', ColumnTransformer(
            [('keep', FunctionTransformer(lambda x: x), ['total_bill', 'size']),
             ('ohe', OneHotEncoder(), ['smoker', 'sex', 'time', 'day'])], 
            remainder='drop')), 
        ('lin-reg', LinearRegression())
    ]),
}
In [36]:
pipe_df = pd.DataFrame()

for pipe in pipes:
    errs = cross_val_score(pipes[pipe], X_train, y_train,
                           cv=5, scoring='neg_root_mean_squared_error')
    pipe_df[pipe] = -errs
    
pipe_df.index = [f'Fold {i}' for i in range(1, 6)]
pipe_df.index.name = 'Validation Fold'
In [37]:
pipe_df
Out[37]:
total_bill only total_bill + size total_bill + size + OHE smoker total_bill + size + OHE all
Validation Fold
Fold 1 1.321421 1.273630 1.269500 1.294183
Fold 2 0.951671 0.921717 0.929099 0.934066
Fold 3 0.774420 0.864801 0.858623 0.867915
Fold 4 0.848454 0.838873 0.835130 0.860885
Fold 5 1.102937 1.070329 1.069948 1.083139
In [38]:
pipe_df.mean()
Out[38]:
total_bill only                   0.999780
total_bill + size                 0.993870
total_bill + size + OHE smoker    0.992460
total_bill + size + OHE all       1.008038
dtype: float64
In [39]:
pipe_df.mean().idxmin()
Out[39]:
'total_bill + size + OHE smoker'

Even though the third model has the lowest average validation RMSE, it is only marginally lower than that of the other, simpler models, so in practice we'd likely choose the simplest of these models.

Summary: Generalization¶

  1. Split the data into two sets: training and test.

  2. Use only the training data when designing, training, and tuning the model.

    • Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
    • Do not ❌ look at the test data in this step!
  3. Commit to your final model and train it using the entire training set.

  4. Evaluate the final model using the test data. If the performance (e.g. RMSE) is not acceptable, return to step 2.

  5. Finally, train on all available data and ship the model to production! 🛳

🚨 This is the process you should always use! 🚨

Discussion Question 🤔¶

  • Suppose you have a training dataset with 1000 rows.
  • You want to decide between 20 hyperparameters for a particular model.
  • To do so, you perform 10-fold cross-validation.
  • How many times is the first row in the training dataset (X.iloc[0]) used for training a model?

Summary, next time¶

Summary¶

  • A model's training error tends to decrease as model complexity increases, while its test error tends to decrease at first, reach a "sweet spot", and then increase again.
  • A hyperparameter is a configuration that we choose before training a model; an important task in machine learning is selecting "good" hyperparameters.
  • In order to quantify a model's ability to generalize to unseen data, use $k$-fold cross-validation.
    • In particular, $k$-fold CV is used to select hyperparameters.

Next time¶

  • Example: Decision trees 🌲.
  • Tuning multiple hyperparameters at once.
  • Multicollinearity (time permitting).