In [1]:
from dsc80_utils import *
In [2]:
# Carryover setup from last lecture
import seaborn as sns
tips = sns.load_dataset('tips')

from sklearn.linear_model import LinearRegression

import lec15_util as util

Lecture 15 – Multicollinearity, Generalization, Cross-validation¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Project 4 out, due this Friday, Dec 1.
    • Start early! Historically, the last two questions of the project take up about 75% of the project time.
  • Lab 9 out, due Dec 4. (The last lab!)

📆 Agenda¶

  • Multicollinearity
    • Why we drop one column when using one-hot encoding
  • Generalization
  • Cross-validation

🙋🙋🏽‍♀️ Slido¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Multicollinearity¶

Heights and weights¶

We have a dataset containing the weights and heights of 25,000 18-year-olds, taken from here.

In [3]:
people = pd.read_csv('data/SOCR-HeightWeight.csv').drop(columns=['Index'])
people.head()
Out[3]:
   Height (Inches)  Weight (Pounds)
0            65.78           112.99
1            71.52           136.49
2            69.40           153.03
3            68.22           142.34
4            67.79           144.30
In [4]:
people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)', 
            title='Weight vs. Height for 25,000 18 Year Olds')

Motivating example¶

Suppose we fit a simple linear regression model that uses height in inches to predict weight in pounds.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)}$$
In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
In [6]:
X = people[['Height (Inches)']]
y = people['Weight (Pounds)']
In [7]:
lr_one_feat = LinearRegression()
lr_one_feat.fit(X, y)
Out[7]:
LinearRegression()

$w_0^*$ and $w_1^*$ are shown below, along with the model's training set RMSE.

In [8]:
lr_one_feat.intercept_, lr_one_feat.coef_
Out[8]:
(-82.57574306454099, array([3.08]))
In [9]:
rmse_one_feat = mean_squared_error(
    y, lr_one_feat.predict(X), squared=False
)
rmse_one_feat
Out[9]:
10.079113675632819

Now, suppose we fit another regression model, that uses height in inches AND height in centimeters to predict weight.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)} + w_2 \cdot \text{height (cm)}$$
In [10]:
people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm
In [11]:
X2 = people[['Height (Inches)', 'Height (cm)']]
In [12]:
lr_two_feat = LinearRegression()
lr_two_feat.fit(X2, y)
Out[12]:
LinearRegression()

What are $w_0^*$, $w_1^*$, $w_2^*$, and the model's training RMSE?

In [13]:
lr_two_feat.intercept_, lr_two_feat.coef_
Out[13]:
(-82.57585227669999, array([-2.46e+10,  9.67e+09]))
In [14]:
rmse_two_feat = mean_squared_error(
    y, lr_two_feat.predict(X2), squared=False)
rmse_two_feat
Out[14]:
10.079113005376787

Observation: The intercept is the same as before (roughly -82.58), as is the training RMSE. However, the coefficients on 'Height (Inches)' and 'Height (cm)' are massive in size!

What's going on?

Redundant features¶

Let's use simpler numbers for illustration. Suppose in the first model, $w_0^* = -80$ and $w_1^* = 3$.

$$\text{predicted weight (pounds)} = -80 + 3 \cdot \text{height (inches)}$$

In the second model, we have:

$$\begin{align*}\text{predicted weight (pounds)} &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \text{height (cm)} \\ &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \big( 2.54 \cdot \text{height (inches)} \big) \\ &= w_0^* + \left(w_1^* + 2.54 \cdot w_2^* \right) \cdot \text{height (inches)} \end{align*}$$

In the first model, we already found the "best" intercept ($-80$) and slope ($3$) in a linear model that uses height in inches to predict weight.

So, as long as $w_1^* + 2.54 \cdot w_2^* = 3$ in the second model, the second model's training predictions will be the same as the first, and hence they will also minimize RMSE.

Infinitely many parameter choices¶

Issue: There are an infinite number of $w_1^*$ and $w_2^*$ that satisfy $w_1^* + 2.54 \cdot w_2^* = 3$!

$$\text{predicted weight} = -80 - 10 \cdot \text{height (inches)} + \frac{13}{2.54} \cdot \text{height (cm)}$$
$$\text{predicted weight} = -80 + 10 \cdot \text{height (inches)} - \frac{7}{2.54} \cdot \text{height (cm)}$$
  • Both prediction rules look very different, but actually make the same predictions.
  • lr.coef_ could return either set of coefficients, or any other of the infinitely many options.
  • But neither set of coefficients has any meaning!
In [15]:
(-80 - 10 * people.iloc[:, 0] + (13 / 2.54) * people.iloc[:, 2]).head()
Out[15]:
0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64
In [16]:
(-80 + 10 * people.iloc[:, 0] - (7 / 2.54) * people.iloc[:, 2]).head()
Out[16]:
0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

Multicollinearity¶

  • Multicollinearity occurs when features in a regression model are highly correlated with one another.

    • In other words, multicollinearity occurs when a feature can be predicted fairly accurately using a linear combination of the other features.
  • When multicollinearity is present in the features, the coefficients in the model are uninterpretable – they have no meaning.

    • A "slope" represents "the rate of change of $y$ with respect to a feature", when all other features are held constant – but if there's multicollinearity, you can't hold other features constant.
  • Note: Multicollinearity doesn't impact a model's predictions!

    • It doesn't impact a model's ability to generalize to unseen data.
    • If features are multicollinear in the training data, they will probably be multicollinear in the test data too.
  • Solutions:

    • Manually remove highly correlated features.
    • Use a dimensionality reduction technique (such as PCA) to automatically reduce dimensions.
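
A quick way to spot this kind of redundancy (a sketch, not part of the original analysis) is to look at the correlation matrix of the features; the two height columns from earlier are perfectly correlated.

In [ ]:
# Perfectly correlated features are the most extreme form of multicollinearity.
people[['Height (Inches)', 'Height (cm)']].corr()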

Key takeaways¶

  • Multicollinearity is present in a linear model when one feature can be accurately predicted using one or more other features.
    • In other words, it is present when a feature is redundant.
  • Multicollinearity doesn't pose an issue for prediction; it doesn't hinder a model's ability to generalize. Instead, it renders the coefficients of a linear model meaningless.

Example: One-hot encoding¶

A one-hot encoding will result in multicollinearity unless you drop one of the one-hot encoded features.

Suppose we have the following fitted model:

$$ \begin{aligned} H(x) = 1 + 2 \cdot (\text{smoker==Yes}) - 2 \cdot (\text{smoker==No}) \end{aligned} $$

This is equivalent to:

$$ \begin{aligned} H(x) = 10 - 7 \cdot (\text{smoker==Yes}) - 11 \cdot (\text{smoker==No}) \end{aligned} $$

since $(\text{smoker==Yes}) + (\text{smoker==No}) = 1$ for every row, adding 9 to the intercept and subtracting 9 from both coefficients leaves every prediction unchanged, so there are infinitely many equally good sets of coefficients.

Solution: Drop one of the one-hot encoded columns. sklearn.preprocessing.OneHotEncoder has an option to do this.
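
For instance, here's a minimal sketch (not from the original notebook) using the tips dataset: with drop='first', the 'smoker' column is encoded as a single 0/1 column instead of two perfectly redundant columns.

In [ ]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' drops the first category, leaving one indicator column for 'smoker'.
ohe = OneHotEncoder(drop='first')
ohe.fit_transform(tips[['smoker']]).toarray()[:5]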

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Generalization¶

Example: Overfitting and underfitting¶

Let's collect two samples $\{(x_i, y_i)\}$ from the same data generating process.

In [17]:
np.random.seed(23) # For reproducibility.

def sample_dgp(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_dgp()
sample_2 = sample_dgp()

For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$ (remember, in reality, you won't get to see the DGP).

In [18]:
px.scatter(sample_1, x='x', y='y', title='Sample 1')

Polynomial regression¶

Let's fit three polynomial models on Sample 1:

  • Degree 1.
  • Degree 3.
  • Degree 25.

The PolynomialFeatures transformer will be helpful here.

In [19]:
from sklearn.preprocessing import PolynomialFeatures
In [20]:
# fit_transform fits the transformer and transforms the same input.
d3 = PolynomialFeatures(3)
d3.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))
Out[20]:
array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.],
       [ 1., -2.,  4., -8.]])

Below, we look at our three models' predictions on Sample 1 (which they were trained on).

In [21]:
# Look at the definition of train_and_plot in lec15_util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

The degree 25 polynomial has the lowest RMSE on Sample 1.

How do the same fit polynomials look on Sample 2?

In [22]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
  • The degree 3 polynomial has the lowest RMSE on Sample 2.

  • Note that we didn't get to see Sample 2 when fitting our models!

  • As such, it seems that the degree 3 polynomial generalizes better to unseen data than the degree 25 polynomial does.

What if we fit a degree 1, degree 3, and degree 25 polynomial on Sample 2 as well?

In [23]:
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

Key idea: Degree 25 polynomials seem to vary more when trained on different samples than degree 3 and 1 polynomials do.

Bias and variance¶

The training data we have access to is a sample from the DGP. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same DGP.

Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise:

  • ⭐️ Bias: The expected deviation between a predicted value and an actual value.
    • In other words, for a given $x_i$, how far is $H(x_i)$ from the true $y_i$, on average?
    • Low bias is good! ✅
    • High bias is a sign of underfitting, i.e. that our model is too basic to capture the relationship between our features and response.
  • ⭐️ Model variance ("variance"): The variance of a model's predictions.

    • In other words, for a given $x_i$, what is the variance of $H(x_i)$ across all datasets?
    • Low model variance is good! ✅
    • High model variance is a sign of overfitting, i.e. that our model is too complicated and is prone to fitting to the noise in our training data.
  • Observation variance: The variance due to the random noise in the process we are trying to model (e.g. measurement error). We can't reduce this by choosing a better model; it would require changing how the data is collected (e.g. more precise measurements).

Here, suppose:

  • The red bulls-eye represents your true weight and height 🧍.
  • The dark blue darts represent predictions of your weight and height made by models fit on different samples drawn from the same DGP.

[Image: dartboard diagram illustrating combinations of low/high bias and low/high variance.]

We'd like our models to be in the top left, but in practice that's hard to achieve!
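
To make "model variance" concrete, here's a small simulation sketch (not part of the original notebook, and assuming the sample_dgp function defined above): we refit a degree-1 and a degree-25 polynomial on many fresh samples from the DGP and look at how much their predictions at a single point vary.

In [ ]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

preds = {1: [], 25: []}
for _ in range(100):
    sample = sample_dgp()  # A fresh dataset from the same DGP.
    for d in preds:
        pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
        pl.fit(sample[['x']], sample['y'])
        preds[d].append(pl.predict(pd.DataFrame({'x': [2.9]}))[0])

# The degree-25 predictions at x = 2.9 should vary far more across samples
# than the degree-1 predictions do (high model variance).
{d: np.var(vals) for d, vals in preds.items()}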

Train-test split¶

Avoiding overfitting¶

  • We won't know whether our model has overfit to our sample (training data) unless we get to see how well it performs on a new sample from the same DGP.

  • 💡Idea: Split our sample into a training set and test set.

  • Use only the training set to fit the model (i.e. find $w^*$).

  • Use the test set to evaluate the model's error (RMSE, $R^2$).

  • The test set is like a new sample of data from the same DGP as the training data!

[Image: diagram of a dataset being split into a training set and a test set.]

Train-test split 🚆¶

sklearn.model_selection.train_test_split implements a train-test split for us! 🙏🏼

If X is an array/DataFrame of features and y is an array/Series of responses,

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

randomly splits the features and responses into training and test sets, such that the test set contains 25% of the rows of the full dataset.

In [24]:
from sklearn.model_selection import train_test_split
In [25]:
# Read the documentation!
train_test_split?

Let's perform a train/test split on our tips dataset.

In [26]:
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # We don't have to choose 0.25.

Before proceeding, let's check the sizes of X_train and X_test.

In [27]:
print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())
Rows in X_train: 195
     total_bill     sex smoker  day    time  size
4         24.59  Female     No  Sun  Dinner     4
209       12.76  Female    Yes  Sat  Dinner     2
178        9.60  Female    Yes  Sun  Dinner     2
230       24.01    Male    Yes  Sat  Dinner     4
5         25.29    Male     No  Sun  Dinner     4
Rows in X_test: 49
     total_bill     sex smoker   day   time  size
146       18.64  Female     No  Thur  Lunch     3
224       13.42    Male    Yes   Fri  Lunch     2
134       18.26  Female     No  Thur  Lunch     2
131       20.27  Female     No  Thur  Lunch     2
147       11.87  Female     No  Thur  Lunch     2
In [28]:
X_train.shape[0] / tips.shape[0]
Out[28]:
0.7991803278688525

Example train-test split¶

Steps:

  1. Fit a model on the training set.
  2. Evaluate the model on the test set.
In [29]:
tips.head()
Out[29]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
In [30]:
X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips.
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random_state is like np.random.seed.

Here, we'll use a stand-alone LinearRegression model without a Pipeline, but this process would work the same if we were using a Pipeline.

In [31]:
lr = LinearRegression()
lr.fit(X_train, y_train)
Out[31]:
LinearRegression()

Let's check our model's performance on the training set first.

In [32]:
from sklearn.metrics import mean_squared_error # Built-in RMSE/MSE function.
In [33]:
pred_train = lr.predict(X_train)
rmse_train = mean_squared_error(y_train, pred_train, squared=False)
rmse_train
Out[33]:
0.9803205287924737

And the test set:

In [34]:
pred_test = lr.predict(X_test)
rmse_test = mean_squared_error(y_test, pred_test, squared=False)
rmse_test
Out[34]:
1.138177129113125

Since rmse_train and rmse_test are similar, it doesn't seem like our model is overfitting to the training data. If rmse_test were much larger than rmse_train, that would be evidence that our model is unable to generalize well.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Hyperparameters¶

Example: Polynomial regression¶

We recently looked at an example of polynomial regression.

In [35]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

When building these models:

  • We got to choose the degree of the polynomials (i.e. we chose 1, 3, and 25).
  • We didn't get to choose the exact formulas for the three polynomials – their formulas were learned from data.

Parameters vs. hyperparameters¶

  • A parameter defines the relationship between variables in a model.

    • We learn parameters from data.
    • For instance, suppose we fit a degree 3 polynomial to data, and end up with

    $$H(x) = 1 - 2x + 13x^2 - 4x^3$$

    • 1, -2, 13, and -4 are parameters.
  • A hyperparameter is a parameter that we get to choose before our model is fit to the data.

    • Think of hyperparameters as knobs 🎛 – we get to pick and tune them!
    • Polynomial degree was a hyperparameter in the previous example, and we tried three different values – 1, 3, and 25.
  • Question: How do we choose the "right" hyperparameter(s)?

Training error vs. test error¶

  • We know that a model's performance on a test set is a good estimate of its ability to generalize to unseen data.

  • We want to find the hyperparameter that leads to the best test set performance.

  • Idea:

    1. Come up with a list of hyperparameters to try.
    2. For each hyperparameter, train the model on the training set and compute its performance on the test set.
    3. Pick the hyperparameter with the best performance on the test set.

Training error vs. test error¶

  • Let's try this strategy on sample 1 from our earlier example.

  • We'll try to fit a polynomial model on the dataset; we'll choose the polynomial's degree from the list [1, 2, ..., 25].

In [36]:
px.scatter(sample_1, x='x', y='y', title='Sample 1')

First, we perform a train-test split.

In [37]:
X = sample_1[['x']]
y = sample_1['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

💡 Pro-Tip: Using make_pipeline¶

Instead of using the Pipeline class directly, you can use make_pipeline for a more compact syntax:

(Also, I like using the analogous sklearn.compose.make_column_transformer instead of the ColumnTransformer class; a short sketch appears after the next cell.)

In [38]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures
In [39]:
# Pipeline with degree-2 polynomial features
pl = Pipeline([('poly', PolynomialFeatures(2)), ('lin-reg', LinearRegression())])

# Same pipeline, notice that make_pipeline generates names for pipeline steps
pl = make_pipeline(PolynomialFeatures(2), LinearRegression())
pl
Out[39]:
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])
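
Here's a brief sketch of the analogous make_column_transformer shortcut (my own illustration, not from the original notebook); like make_pipeline, it generates names for its transformers automatically.

In [ ]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Equivalent to a ColumnTransformer, but the transformer names are auto-generated.
preproc = make_column_transformer(
    (OneHotEncoder(), ['smoker']),        # One-hot encode the 'smoker' column.
    (StandardScaler(), ['total_bill']),   # Standardize the 'total_bill' column.
)
preproc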

Polynomial degree vs. train/test error¶

Now, we'll create models with degree-1 through degree-25 polynomial features and compute their train and test errors.

In [40]:
train_errs = []
test_errs = []

for d in range(1, 26):
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    pl.fit(X_train, y_train)
    train_errs.append(mean_squared_error(y_train, pl.predict(X_train), squared=False))
    test_errs.append(mean_squared_error(y_test, pl.predict(X_test), squared=False))

errs = pd.DataFrame({'Train Error': train_errs, 'Test Error': test_errs},
                    index=range(1, 26))  # Index by degree so the plot's x-axis matches.

Let's look at the plots of training error vs. degree and test error vs. degree.

In [41]:
fig = px.line(errs)
fig.update_layout(showlegend=True, xaxis_title='Degree', yaxis_title='RMSE')
  • Training error appears to decrease as polynomial degree increases.

  • Test error appears to decrease until a "valley", and then increases again.

  • Here, we'd choose a degree of 3, since that degree has the lowest test error.

Training error vs. test error¶

The pattern we saw in the previous example is true more generally.

[Image: plot of training error and test error vs. model complexity.]

We pick the hyperparameter(s) at the "valley" of test error.

Note that training error tends to underestimate test error, but it doesn't have to – i.e., it is possible for test error to be lower than training error (say, if the test set is "easier" to predict than the training set).

Conducting train-test splits¶

  • Recall, training data is used to fit our model, and test data is used to evaluate our model.
  • Question: How should we split?

    • sklearn's train_test_split splits randomly, which usually works well.
    • However, if there is some element of time in the training data (say, when predicting the future price of a stock), a better split is "past" and "future" (see the sketch after this list).
  • Question: How large should the split be, e.g. 90%-10% vs. 75%-25%?

    • There's a tradeoff – a larger training set should lead to a "better" model, while a larger test set should lead to a better estimate of our model's ability to generalize.
    • There's no "right" choice, but we usually choose between 10% to 25% for the test set.

But wait...¶

  • With our current strategy, we are choosing the hyperparameter that creates the model that performs best on the test set.

  • As such, we are overfitting to the test set – the best hyperparameter for the test set might not be the best hyperparameter for a totally unseen dataset!

  • It seems like we need another split.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Cross-validation¶

A single validation set¶

[Image: diagram of a dataset split into training, validation, and test sets.]
  1. Split the data into three sets: training, validation, and test.

  2. For each hyperparameter choice, train the model only on the training set, and evaluate the model's performance on the validation set.

  3. Find the hyperparameter with the best validation performance.

  4. Retrain the final model on the training and validation sets, and report its performance on the test set.
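
Here's a minimal sketch of step 1, using the X and y from earlier (the 60%/20%/20% split below is just one reasonable choice, not prescribed here): two calls to train_test_split produce the three sets.

In [ ]:
from sklearn.model_selection import train_test_split

# First split off a test set, then split the remainder into training and validation.
# (Names chosen to avoid overwriting the X_test/y_test defined earlier.)
X_temp, X_test_final, y_temp, y_test_final = train_test_split(X, y, test_size=0.2, random_state=1)
X_tr, X_val, y_tr, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1)
# 0.25 of the remaining 80% is 20% of the full data, giving a 60%/20%/20% split.
X_tr.shape[0], X_val.shape[0], X_test_final.shape[0]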

Issue: This strategy is too dependent on the validation set, which may be small and/or not a representative sample of the data.

$k$-fold cross-validation¶

Instead of relying on a single validation set, we can create $k$ validation sets, where $k$ is some positive integer (5 in the example below).

[Image: diagram of 5-fold cross-validation, where each of the 5 folds is used as the validation set exactly once.]

Since each data point is used for training $k-1$ times and validation once, the (averaged) validation performance should be a good metric of a model's ability to generalize to unseen data.

$k$-fold cross-validation (or simply "cross-validation") is the technique we will use for finding hyperparameters.

Creating folds in sklearn¶

sklearn has a KFold class that splits data into training and validation folds.

In [42]:
from sklearn.model_selection import KFold

Let's use a simple dataset for illustration.

In [43]:
data = np.arange(10, 70, 10)
data
Out[43]:
array([10, 20, 30, 40, 50, 60])

Let's instantiate a KFold object with $k=3$.

In [44]:
kfold = KFold(3, shuffle=True, random_state=1)
kfold
Out[44]:
KFold(n_splits=3, random_state=1, shuffle=True)

Finally, let's use kfold to split data:

In [45]:
for train, val in kfold.split(data):
    print(f'train: {data[train]}, validation: {data[val]}')
train: [10 40 50 60], validation: [20 30]
train: [20 30 40 60], validation: [10 50]
train: [10 20 30 50], validation: [40 60]

Note that each value in data is used for validation exactly once and for training exactly twice. Also note that, because we set shuffle=True, the validation groups are not simply [10, 20], [30, 40], and [50, 60].

$k$-fold cross-validation¶

First, shuffle the dataset randomly and split it into $k$ disjoint groups. Then:

  • For each hyperparameter:
    • For each unique group:
      • Let the unique group be the "validation set".
      • Let all other groups be the "training set".
      • Train a model using the selected hyperparameter on the training set.
      • Evaluate the model on the validation set.
    • Compute the average validation score (e.g. RMSE) for the particular hyperparameter.
  • Choose the hyperparameter with the best average validation score.
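
Here's a rough hand-written version of this procedure (a sketch, using the X_train and y_train from our earlier split of sample_1); the cross_val_score function introduced next automates exactly this loop.

In [ ]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
avg_rmses = {}
for d in [1, 3, 25]:                                   # Hyperparameter values to try.
    fold_rmses = []
    for train_idx, val_idx in kfold.split(X_train):    # Positional indices for each fold.
        pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
        pl.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        preds = pl.predict(X_train.iloc[val_idx])
        fold_rmses.append(mean_squared_error(y_train.iloc[val_idx], preds, squared=False))
    avg_rmses[d] = np.mean(fold_rmses)                 # Average validation RMSE for this degree.

min(avg_rmses, key=avg_rmses.get)                      # Degree with the best (lowest) average RMSE.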

$k$-fold cross-validation in sklearn¶

While you could manually use KFold to perform cross-validation, the cross_val_score function in sklearn implements $k$-fold cross-validation for us!

cross_val_score(estimator, X_train, y_train, cv)

Specifically, it takes in:

  • A Pipeline or estimator that has not already been fit.
  • Training data.
  • A value of $k$ (through the cv argument).
  • (Optionally) A scoring metric.

and performs $k$-fold cross-validation, returning the values of the scoring metric on each fold.

In [46]:
from sklearn.model_selection import cross_val_score

$k$-fold cross-validation in sklearn¶

  • Let's perform $k$-fold cross-validation in order to help us pick a degree for polynomial regression from the list [1, 2, ..., 25].

  • We'll use $k=5$ since it's a common choice (and the default in sklearn).

  • For the sake of this example, we'll suppose sample_1 is our "training + validation data", i.e. that our test data is in some other dataset.

    • If this were not true, we'd first need to split sample_1 into separate training and test sets.
In [47]:
errs_df = pd.DataFrame()

for d in range(1, 26):
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    
    # The `scoring` argument is used to specify that we want to compute the RMSE; 
    # the default is R^2. It's called "neg" RMSE because, 
    # by default, sklearn likes to "maximize" scores, and maximizing -RMSE is the same
    # as minimizing RMSE.
    errs = cross_val_score(pl, sample_1[['x']], sample_1['y'], 
                           cv=5, scoring='neg_root_mean_squared_error')
    errs_df[f'Deg {d}'] = -errs # Negate to turn positive (sklearn computed negative RMSE).
    
errs_df.index = [f'Fold {i}' for i in range(1, 6)]
errs_df.index.name = 'Validation Fold'

Next class, we'll look at how to implement this procedure without needing to for-loop over values of d.

$k$-fold cross-validation in sklearn¶

Note that for each choice of degree (our hyperparameter), we have five RMSEs, one for each "fold" of the data. This means that in total, $25 \cdot 5 = 125$ models were trained/fit to data!

In [48]:
errs_df
Out[48]:
                 Deg 1  Deg 2  Deg 3  Deg 4  ...    Deg 22    Deg 23    Deg 24    Deg 25
Validation Fold
Fold 1            4.79  12.81   5.04   4.93  ...  8.78e+06  6.57e+07  1.41e+08  5.90e+08
Fold 2            3.97   5.36   3.19   3.22  ...  2.93e+01  7.85e+01  7.53e+01  3.13e+01
Fold 3            4.77   2.56   2.08   2.11  ...  3.03e+01  3.09e+01  4.24e+01  3.72e+01
Fold 4            6.13   4.66   2.93   2.93  ...  6.27e+00  3.33e+01  5.80e+01  9.69e+00
Fold 5           11.70  11.92   3.24   4.37  ...  8.36e+06  2.28e+08  8.17e+08  6.63e+09

5 rows × 25 columns

We should choose the degree with the lowest average validation RMSE.

In [49]:
errs_df.mean().idxmin()
Out[49]:
'Deg 3'

Note that if we didn't perform $k$-fold cross-validation, but instead just used a single validation set, we might have ended up with a different result:

In [50]:
errs_df.idxmin(axis=1)
Out[50]:
Validation Fold
Fold 1    Deg 1
Fold 2    Deg 6
Fold 3    Deg 8
Fold 4    Deg 3
Fold 5    Deg 3
dtype: object

Note: You may notice that the RMSEs in Folds 1 and 5 are much higher than in other folds. Can you think of reasons why, and how we might fix this?

In [51]:
px.scatter(sample_1, x='x', y='y', title='Sample 1')

Another example: Tips¶

We can also use $k$-fold cross-validation to decide which subset of features to use in a linear model that predicts tips, by making one pipeline for each candidate subset of features.

In [52]:
# make_column_transformer is a shortcut for the ColumnTransformer class
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

As we should always do, we'll perform a train-test split on tips and will only use the training data for cross-validation.

In [53]:
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
In [54]:
# A dictionary that maps names to Pipeline objects.
pipes = {
    'total_bill only': make_pipeline(
        make_column_transformer(
            (FunctionTransformer(lambda x: x), ['total_bill']),
        ),
        LinearRegression(),
    ),
    'total_bill + size': make_pipeline(
        make_column_transformer(
            (FunctionTransformer(lambda x: x), ['total_bill', 'size']),
        ),
        LinearRegression(),
    ),
    'total_bill + size + OHE smoker': make_pipeline(
        make_column_transformer(
            (FunctionTransformer(lambda x: x), ['total_bill', 'size']),
            (OneHotEncoder(drop='first'), ['smoker']),
        ),
        LinearRegression(),
    ),
    'total_bill + size + OHE all': make_pipeline(
        make_column_transformer(
            (FunctionTransformer(lambda x: x), ['total_bill', 'size']),
            (OneHotEncoder(drop='first'), ['smoker', 'sex', 'time', 'day']),
        ),
        LinearRegression(),
    ),
}
In [55]:
pipe_df = pd.DataFrame()

for pipe in pipes:
    errs = cross_val_score(pipes[pipe], X_train, y_train,
                           cv=5, scoring='neg_root_mean_squared_error')
    pipe_df[pipe] = -errs
    
pipe_df.index = [f'Fold {i}' for i in range(1, 6)]
pipe_df.index.name = 'Validation Fold'
In [56]:
pipe_df
Out[56]:
                 total_bill only  total_bill + size  total_bill + size + OHE smoker  total_bill + size + OHE all
Validation Fold
Fold 1                      1.32               1.27                            1.27                         1.29
Fold 2                      0.95               0.92                            0.93                         0.93
Fold 3                      0.77               0.86                            0.86                         0.87
Fold 4                      0.85               0.84                            0.84                         0.86
Fold 5                      1.10               1.07                            1.07                         1.08
In [57]:
pipe_df.mean()
Out[57]:
total_bill only                   1.00
total_bill + size                 0.99
total_bill + size + OHE smoker    0.99
total_bill + size + OHE all       1.01
dtype: float64
In [58]:
pipe_df.mean().idxmin()
Out[58]:
'total_bill + size + OHE smoker'

Even though the third model has the lowest average validation RMSE, it is very close to the simpler models' RMSEs, so in practice we'd likely use the simplest model.

Summary: Generalization¶

  1. Split the data into two sets: training and test.

  2. Use only the training data when designing, training, and tuning the model.

    • Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
    • Do not ❌ look at the test data in this step!
  3. Commit to your final model and train it using the entire training set.

  4. Evaluate the final model on the test data. If the performance (e.g. RMSE) is not acceptable, return to step 2.

  5. Finally, train on all available data and ship the model to production! 🛳

🚨 This is the process you should always use! 🚨

Discussion Question 🤔¶

  • Suppose you have a training dataset with 1000 rows.
  • You want to decide between 20 candidate values for a hyperparameter of a particular model.
  • To do so, you perform 10-fold cross-validation.
  • How many times is the first row in the training dataset (X.iloc[0]) used for training a model?

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8
