from dsc80_utils import *
import lec15_util as util

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error # New!

import seaborn as sns
# Load seaborn's example `tips` dataset and preview the first rows.
tips = sns.load_dataset('tips')
tips.head()

# Let's define these once, since we'll use them repeatedly.
X = tips[['total_bill', 'size']]
y = tips['tip']

# Model 1: standardize the two features, then fit ordinary least squares.
model_with_std = Pipeline([
    ('standardize', StandardScaler()),
    ('lin-reg', LinearRegression())
])


model_with_std.fit(X, y)

Pipeline(steps=[('standardize', StandardScaler()),
                ('lin-reg', LinearRegression())])

# Score of the standardized model on the same data it was fit on.
model_with_std.score(X, y)

0.46786930879612587

# RMSE of the standardized model. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; sqrt(MSE) works on every version.
np.sqrt(mean_squared_error(y, model_with_std.predict(X)))

1.007256127114662

# Model 2: the same regression on the raw (unstandardized) features.
model_without_std = LinearRegression()
model_without_std.fit(X, y)

LinearRegression()

# Same score as the standardized model ...
model_without_std.score(X, y)

0.46786930879612587

# ... and the same RMSE: standardizing did not change the predictions.
np.sqrt(mean_squared_error(y, model_without_std.predict(X)))

1.007256127114662

# Total bill, table size.
# Coefficients are per original unit (dollars of bill, number of people).
model_without_std.coef_

array([0.09, 0.19])

# Total bill, table size.
# Coefficients of the regression step inside the pipeline, i.e. per
# standardized unit of each feature, so they differ from the raw ones above.
model_with_std.named_steps['lin-reg'].coef_

array([0.82, 0.18])

# Height/weight data; the 'Index' column is dropped right after loading.
people_path = Path('data') / 'SOCR-HeightWeight.csv'
people = pd.read_csv(people_path).drop(columns=['Index'])
people.head()

people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)',
            title='Weight vs. Height for 25,000 18 Year Olds')

# Single-feature model: predict weight from height in inches.
X = people[['Height (Inches)']]
y = people['Weight (Pounds)']

lr_one_feat = LinearRegression()
lr_one_feat.fit(X, y)

LinearRegression()

lr_one_feat.intercept_, lr_one_feat.coef_

(-82.57574306454093, array([3.08]))

# RMSE via an explicit square root: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y, lr_one_feat.predict(X)))

10.079113675632819

# Add a perfectly redundant feature: the same height, expressed in centimeters.
people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm

X2 = people[['Height (Inches)', 'Height (cm)']]

lr_two_feat = LinearRegression()
lr_two_feat.fit(X2, y)

LinearRegression()

# With two collinear features the individual coefficients become huge and
# offsetting (see the output below), even though the intercept barely moves.
lr_two_feat.intercept_, lr_two_feat.coef_

(-82.57525639659859, array([ 3.38e+10, -1.33e+10]))

# The RMSE is essentially unchanged from the one-feature model.
np.sqrt(mean_squared_error(y, lr_two_feat.predict(X2)))

10.079113677131511

# One choice of parameters: -80 - 10 * inches + (13 / 2.54) * cm. Since
# cm = 2.54 * inches, this simplifies to -80 + 3 * inches. Columns are selected
# by name rather than by position so the expression does not silently depend on
# the order in which columns were added to `people`.
(-80 - 10 * people['Height (Inches)'] + (13 / 2.54) * people['Height (cm)']).head()

0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

# A different choice of parameters: -80 + 10 * inches - (7 / 2.54) * cm. Since
# cm = 2.54 * inches, this also simplifies to -80 + 3 * inches, so it yields the
# same predictions. Columns are selected by name rather than by position so the
# expression does not silently depend on column order.
(-80 + 10 * people['Height (Inches)'] - (7 / 2.54) * people['Height (cm)']).head()

0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

def is_weekend(s):
    """Relabel the weekday values 'Thur' and 'Fri' as 'Weekday'.

    Every other value is returned unchanged, so 'Sat' and 'Sun' keep their
    own labels rather than being merged into a single weekend category.

    Parameters
    ----------
    s : pandas object with a ``.replace`` method
        The day labels. The original note states the input is a Series.

    Returns
    -------
    Same kind of object as ``s``, with the two weekday labels replaced.
    """
    relabel = dict.fromkeys(('Thur', 'Fri'), 'Weekday')
    return s.replace(relabel)

from sklearn.preprocessing import Binarizer, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Day of week: relabel Thur/Fri as 'Weekday' with is_weekend, then one-hot
# encode the resulting labels, dropping the first category.
pl_day = Pipeline(steps=[
    ('is-weekend', FunctionTransformer(is_weekend)),
    ('ohe', OneHotEncoder(drop='first')),
])

# Per-column preprocessing. Columns not named in any transformer are passed
# through unchanged because of remainder='passthrough'.
col_trans = ColumnTransformer(
    transformers=[
        ('transform-day', pl_day, ['day']),
        ('ohe-others', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
        # Binarizer maps values above the threshold to 1 and the rest to 0.
        ('binarize-size', Binarizer(threshold=2), ['size']),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by ordinary least squares.
pl = Pipeline(steps=[
    ('transf', col_trans),
    ('lin-reg', LinearRegression()),
])

# Fit on every column except the target, 'tip'.
pl.fit(tips.drop(columns='tip'), tips['tip'])

Pipeline(steps=[('transf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('transform-day',
                                                  Pipeline(steps=[('is-weekend',
                                                                   FunctionTransformer(func=<function is_weekend at 0x164182af0>)),
                                                                  ('ohe',
                                                                   OneHotEncoder(drop='first'))]),
                                                  ['day']),
                                                 ('ohe-others',
                                                  OneHotEncoder(drop='first'),
                                                  ['sex', 'smoker', 'time']),
                                                 ('binarize-size',
                                                  Binarizer(threshold=2),
                                                  ['size'])])),
                ('lin-reg', LinearRegression())])

# Coefficients of the fitted regression step, one per column produced by the
# ColumnTransformer. ColumnTransformer emits its transformers' outputs in the
# order they are listed and appends passthrough (remainder) columns on the right.
pl.named_steps['lin-reg'].coef_

array([ 0.12,  0.13, -0.03, -0.09, -0.05,  0.27,  0.1 ])

np.random.seed(23) # For reproducibility.

def sample_from_pop(n=100):
    """Draw a noisy sample of size ``n`` from the cubic relationship y = x^3.

    The x values are ``n`` evenly spaced points on [-2, 3]. Each y value is
    x cubed plus normal noise with mean 0 and standard deviation 3, drawn from
    NumPy's global random state (so results depend on the current seed).

    Returns a DataFrame with columns 'x' and 'y'.
    """
    xs = np.linspace(-2, 3, n)
    noise = np.random.normal(0, 3, size=n)
    return pd.DataFrame({'x': xs, 'y': xs ** 3 + noise})

# Two separate draws from the same population. sample_from_pop uses the same
# x grid on every call, so the samples differ only in their random noise.
sample_1 = sample_from_pop()
sample_2 = sample_from_pop()

# Visualize the first sample.
px.scatter(sample_1, x='x', y='y', title='Sample 1')

from sklearn.preprocessing import PolynomialFeatures

# fit_transform fits and transforms the same input.
# NOTE: despite the name `d2`, this transformer is created with degree 3, so
# each input value x is expanded into the four columns [1, x, x^2, x^3].
d2 = PolynomialFeatures(3)
# reshape(-1, 1) turns the 1-D array into a single-column 2-D array.
d2.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.],
       [ 1., -2.,  4., -8.]])

# Look at the definition of train_and_plot in lec15_util.py if you're curious as to how the plotting works.
# Train and evaluate on the same sample (sample_1) for polynomial degrees 1, 3, and 25.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25], data_name='Sample 1')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

# Same degrees, still trained on sample_1, but now evaluated on sample_2.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

# NOTE(review): presumably compares models fit on each sample for each degree;
# confirm against plot_multiple_models in lec15_util.py.
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

# Import before first use, so the call below does not depend on
# train_test_split already being in scope from somewhere else.
from sklearn.model_selection import train_test_split

# General calling pattern: the train and test pieces of each input are
# returned in this order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Read the documentation!
# help() works in plain Python as well as in IPython/Jupyter, where
# `train_test_split?` is the equivalent shorthand.
help(train_test_split)

# Split the tips data: every column except 'tip' is a feature, 'tip' is the target.
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # We don't have to choose 0.25.

# Show how many rows landed in each piece, along with a preview of each.
print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())

Rows in X_train: 195

Rows in X_test: 49

# Fraction of all rows that ended up in the training set.
X_train.shape[0] / tips.shape[0]

0.7991803278688525

tips.head()

X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips.
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random_state is like np.random.seed.

# Fit on the training set only.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

# Training RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; sqrt(MSE) works on every version.
pred_train = lr.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_train

0.9803205287924736

# Test RMSE: error on rows the model never saw while fitting.
pred_test = lr.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
rmse_test

1.1381771291131253

# Repeat of the earlier plot: trained on sample_1, evaluated on sample_2.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

px.scatter(sample_1, x='x', y='y', title='Sample 1')

# From here on, only sample_1 is used: X is a one-column DataFrame, y a Series.
X = sample_1[['x']]
y = sample_1['y']

# No test_size is given, so scikit-learn's default split proportions apply;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Pipeline with degree-2 polynomial features
# (This assignment is immediately overwritten below; it is shown for comparison.)
pl = Pipeline([('poly', PolynomialFeatures(2)), ('lin-reg', LinearRegression())])

# Same pipeline, notice that make_pipeline generates names for pipeline steps
# (the lowercased class names, as the repr of `pl` shows).
pl = make_pipeline(PolynomialFeatures(2), LinearRegression())
pl

Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                ('linearregression', LinearRegression())])

# Polynomial degrees to try; degree is the hyperparameter being tuned.
degrees = range(1, 26)

train_errs = []
test_errs = []

for d in degrees:
    # Fit a degree-d polynomial regression on the training set only.
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    pl.fit(X_train, y_train)
    # RMSE via an explicit square root: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    train_errs.append(np.sqrt(mean_squared_error(y_train, pl.predict(X_train))))
    test_errs.append(np.sqrt(mean_squared_error(y_test, pl.predict(X_test))))

# Index the errors by degree (1..25) rather than by list position (0..24), so
# the x-axis of the plot below really shows the polynomial degree.
errs = pd.DataFrame({'Train Error': train_errs, 'Test Error': test_errs}, index=degrees)

# One line per column; the DataFrame index is used for the x-axis.
fig = px.line(errs)
fig.update_layout(showlegend=True, xaxis_title='Polynomial Degree', yaxis_title='RMSE')

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	Height (Inches)	Weight (Pounds)
0	65.78	112.99
1	71.52	136.49
2	69.40	153.03
3	68.22	142.34
4	67.79	144.30

	total_bill	sex	smoker	day	time	size
4	24.59	Female	No	Sun	Dinner	4
209	12.76	Female	Yes	Sat	Dinner	2
178	9.60	Female	Yes	Sun	Dinner	2
230	24.01	Male	Yes	Sat	Dinner	4
5	25.29	Male	No	Sun	Dinner	4

	total_bill	sex	smoker	day	time	size
146	18.64	Female	No	Thur	Lunch	3
224	13.42	Male	Yes	Fri	Lunch	2
134	18.26	Female	No	Thur	Lunch	2
131	20.27	Female	No	Thur	Lunch	2
147	11.87	Female	No	Thur	Lunch	2

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 15 – Standardization, Multicollinearity, and Generalization

DSC 80, Winter 2024

Announcements 📣

RSVP to the senior capstone showcase on March 15th!

Look at the list of topics and RSVP at hdsishowcase.com!

Agenda 📆

Question 🤔 (Answer at q.dsc80.com)

Standardization

Review: Transformers, models, and Pipelines

An example Pipeline

The purpose of standardizing features

Multicollinearity

Heights and weights

Motivating example

Redundant features

Infinitely many parameter choices

Multicollinearity

Example: One hot encoding

Aside: Pipelines of just transformers

Key takeaways

Question 🤔 (Answer at q.dsc80.com)

Exercise

Generalization

Motivation

Evaluating the quality of a model

Example: Overfitting and underfitting

Polynomial regression

Bias and variance

Risk vs. empirical risk

The bias-variance decomposition

Navigating the bias-variance tradeoff

Question 🤔 (Answer at q.dsc80.com)

Train-test splits

Avoiding overfitting

Train-test split 🚆

Example train-test split

Hyperparameters

Example: Polynomial regression

Parameters vs. hyperparameters

Training error vs. test error

Training error vs. test error

💡 Pro-Tip: Using make_pipeline

Polynomial degree vs. train/test error

Training error vs. test error

Conducting train-test splits

But wait...

Summary, next time

Summary

Next time

Review: Transformers, models, and `Pipeline`s

An example `Pipeline`

Aside: `Pipeline`s of just transformers

💡 Pro-Tip: Using `make_pipeline`