from dsc80_utils import *
import lec15_util as util

# Load the tips dataset through px.data.
tips = px.data.tips()

# Keep only the four categorical columns; every row is retained.
tips_cat = tips.loc[:, ['sex', 'smoker', 'day', 'time']]
tips_cat.head()

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

# A two-step Pipeline: one-hot encode every column of the input, then fit a
# linear regression on the encoded matrix. The step names are the keys used
# by the named_steps lookups further down in this cell group.
pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

# Fit the whole Pipeline on the categorical features and the observed tips.
pl.fit(tips_cat, tips['tip'])

# Predict tips for the first five rows.
pl.predict(tips_cat.iloc[:5])

# Mapping from step name to the step object.
pl.named_steps

# The encoder's output is converted with .toarray() so it displays as a
# regular array.
pl.named_steps['one-hot'].transform(tips_cat).toarray()

pl.named_steps['one-hot'].get_feature_names_out()

pl.named_steps['lin-reg'].coef_

# Why is this so low?
pl.score(tips_cat, tips['tip'])

from sklearn.compose import ColumnTransformer

# Every column except the response ('tip') is a candidate feature.
tips_features = tips.drop(columns='tip')
tips_features.head()

from sklearn.preprocessing import Binarizer

# Each transformer is a (name, transformer, columns) triple:
#   - 'size' is binarized around a threshold of 2,
#   - the four categorical columns are one-hot encoded.
preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)

# The preprocessor runs first; its output is what the regression is fit on.
pl = Pipeline([
    ('preprocessor', preproc),
    ('lin-reg', LinearRegression())
])

pl.fit(tips_features, tips['tip'])

tips_features.head()

# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())

from sklearn.preprocessing import FunctionTransformer

# Wrap np.sqrt in a FunctionTransformer so it exposes the transformer interface.
f = FunctionTransformer(np.sqrt)
# transform is called directly here; no fit call precedes it.
f.transform([1, 2, 3])

# Old code: the ColumnTransformer and Pipeline are built with their
# constructors, so every step is given an explicit name.

preproc = ColumnTransformer(
    transformers=[
        # (name, transformer, columns the transformer is applied to)
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' 
)

pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# make_column_transformer takes (transformer, columns) pairs; no step names
# are supplied by the caller.
preproc = make_column_transformer(
    (Binarizer(threshold=2), ['size']),
    (OneHotEncoder(), ['sex', 'smoker', 'day', 'time']),
    remainder='passthrough',
)

# make_pipeline takes the steps themselves, again without names.
pl = make_pipeline(preproc, LinearRegression())
# Notice that the steps in the pipeline and column transformer are
# automatically named
pl

# Let's define these once, since we'll use them repeatedly.
X = tips[['total_bill', 'size']]
y = tips['tip']

from sklearn.preprocessing import StandardScaler

# Standardize both features, then fit a linear regression on the result.
model_with_std = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)


model_with_std.fit(X, y)

model_with_std.score(X, y)

from sklearn.metrics import mean_squared_error

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases have deprecated
# and then removed.
np.sqrt(mean_squared_error(y, model_with_std.predict(X)))

# The same regression on the raw, unstandardized features, for comparison.
model_without_std = LinearRegression()
model_without_std.fit(X, y)

model_without_std.score(X, y)

np.sqrt(mean_squared_error(y, model_without_std.predict(X)))

# Total bill, table size.
model_without_std.coef_

# Total bill, table size.
model_with_std.named_steps['linearregression'].coef_

def is_weekend(s):
    """Relabel 'Thur' and 'Fri' as 'Weekday'; all other values are unchanged.

    ``s`` must provide a pandas-style ``replace`` method; a new object is
    returned and ``s`` itself is not modified.

    NOTE(review): the original comment said the input is a Series, but the
    column transformer in this file selects ``['day']`` (a list), which
    presumably passes a one-column DataFrame instead -- confirm.
    """
    weekday_labels = dict.fromkeys(['Thur', 'Fri'], 'Weekday')
    return s.replace(weekday_labels)

# Sub-pipeline for the 'day' column: relabel values with is_weekend, then
# one-hot encode the relabelled values.
pl_day = make_pipeline(
    FunctionTransformer(is_weekend),
    OneHotEncoder(),
)

# Route each group of columns to its own transformer; columns not listed
# here are passed through unchanged.
col_trans = make_column_transformer(
    (pl_day, ['day']),
    (OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    (Binarizer(threshold=2), ['size']),
    remainder='passthrough')

# Full model: preprocessing followed by a linear regression.
pl = make_pipeline(
    col_trans,
    LinearRegression(),
)

# Fit on every column except 'tip', using 'tip' as the response.
pl.fit(tips.drop('tip', axis=1), tips['tip'])

# Load the height/weight data and discard the 'Index' column.
people_path = Path('data') / 'SOCR-HeightWeight.csv'
people = pd.read_csv(people_path).drop(columns=['Index'])
people.head()

people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)', 
            title='Weight vs. Height for 25,000 18 Year Olds')

# Single-feature design matrix (kept 2D by the double brackets) and response.
X = people[['Height (Inches)']]
y = people['Weight (Pounds)']

lr_one_feat = LinearRegression()
lr_one_feat.fit(X, y)

lr_one_feat.intercept_, lr_one_feat.coef_

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases have deprecated
# and then removed.
np.sqrt(mean_squared_error(y, lr_one_feat.predict(X)))

# Add a second height column that is an exact linear function of the first.
people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm

X2 = people[['Height (Inches)', 'Height (cm)']]

lr_two_feat = LinearRegression()
lr_two_feat.fit(X2, y)

lr_two_feat.intercept_, lr_two_feat.coef_

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases have deprecated
# and then removed.
np.sqrt(mean_squared_error(y, lr_two_feat.predict(X2)))

# Two different coefficient choices on (inches, cm). The columns are selected
# by name rather than by position so the expressions do not depend on column
# order. Because cm = 2.54 * inches, both expressions reduce to
# -80 + 3 * inches: the first has -10 + 13 = 3, the second has 10 - 7 = 3.
(-80 - 10 * people['Height (Inches)'] + (13 / 2.54) * people['Height (cm)']).head()

(-80 + 10 * people['Height (Inches)'] - (7 / 2.54) * people['Height (cm)']).head()

# A Pipeline made only of transformers (no final estimator step).
# Note: The FunctionTransformer is only needed to change the result
# of the OneHotEncoder from a "sparse" matrix to a regular matrix
# so that it can be used with StandardScaler;
# it doesn't change anything mathematically.
pl = Pipeline([
    ("ohe", OneHotEncoder(drop="first")),
    ("ft", FunctionTransformer(lambda X: X.toarray())),
    ("ss", StandardScaler())
])

# Seeds NumPy's global random state, which sample_from_pop draws its noise
# from via np.random.normal.
np.random.seed(23) # For reproducibility.

def sample_from_pop(n=100):
    """Return a DataFrame of ``n`` noisy points around the curve y = x ** 3.

    The 'x' column holds ``n`` evenly spaced values from -2 to 3 (inclusive).
    The 'y' column is ``x ** 3`` plus normal noise with mean 0 and standard
    deviation 3, drawn from NumPy's global random state.
    """
    xs = np.linspace(-2, 3, n)
    noise = np.random.normal(0, 3, size=n)
    return pd.DataFrame({'x': xs, 'y': xs ** 3 + noise})

# Two samples drawn one after the other with the default size; they share
# the same x values but get different noise draws.
sample_1 = sample_from_pop()
sample_2 = sample_from_pop()

px.scatter(sample_1, x='x', y='y', title='Sample 1')

from sklearn.preprocessing import PolynomialFeatures

# fit_transform fits and transforms the same input.
# NOTE(review): the variable is named d2, but the transformer is constructed
# with degree 3 -- confirm which was intended.
d2 = PolynomialFeatures(3)
# reshape(-1, 1) turns the 1D array into a single-column 2D array.
d2.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))

# Look at the definition of train_and_plot in lec15_util.py if you're curious as to how the plotting works.
# First, train and evaluate on the same sample.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25], data_name='Sample 1')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

# Then train on sample_1 but evaluate on sample_2.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

# Import before first use: calling train_test_split ahead of this import
# raises a NameError when the file is run top to bottom.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Read the documentation!
# help() is used instead of the IPython-only `train_test_split?` form, which
# is a SyntaxError outside of IPython/Jupyter.
help(train_test_split)

# Features are every column except the response, 'tip'.
X = tips.drop(columns='tip')
y = tips['tip']
# We don't have to choose 0.25.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print('Rows in X_train:', len(X_train))
display(X_train.head())
print('Rows in X_test:', len(X_test))
display(X_test.head())

# Fraction of all rows that ended up in the training set.
len(X_train) / len(tips)

tips.head()

X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips.
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # random_state is like np.random.seed.

# Fit on the training rows only.
lr = LinearRegression()
lr.fit(X_train, y_train)

# RMSE is the square root of the MSE. Taking the root explicitly avoids the
# `squared=False` argument, which newer scikit-learn releases have deprecated
# and then removed.
pred_train = lr.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, pred_train))
rmse_train

# Same metric on the held-out rows.
pred_test = lr.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))
rmse_test

# Same figure as earlier in the file: train on sample_1, evaluate on sample_2.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')

px.scatter(sample_1, x='x', y='y', title='Sample 1')

X = sample_1[['x']]
y = sample_1['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

train_errs = []
test_errs = []

# Candidate polynomial degrees: 1 through 25, inclusive.
degrees = range(1, 26)

for d in degrees:
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    pl.fit(X_train, y_train)
    # RMSE is the square root of the MSE. Taking the root explicitly avoids
    # the `squared=False` argument, which newer scikit-learn releases have
    # deprecated and then removed.
    train_errs.append(np.sqrt(mean_squared_error(y_train, pl.predict(X_train))))
    test_errs.append(np.sqrt(mean_squared_error(y_test, pl.predict(X_test))))

# Index the errors by degree so the x-axis of the plot shows the actual
# polynomial degree (1..25) instead of the default 0..24 row positions.
errs = pd.DataFrame(
    {'Train Error': train_errs, 'Test Error': test_errs},
    index=list(degrees),
)

fig = px.line(errs)
fig.update_layout(showlegend=True, xaxis_title='Polynomial Degree', yaxis_title='RMSE')

	size	x0_Female	x0_Male	x1_No	x2_Sun	x3_Dinner	total_bill
0	0	1.0	0.0	1.0	1.0	1.0	16.99
1	1	0.0	1.0	1.0	1.0	1.0	10.34
2	1	0.0	1.0	1.0	1.0	1.0	21.01
3	0	0.0	1.0	1.0	1.0	1.0	23.68
4	1	1.0	0.0	1.0	1.0	1.0	24.59

Lecture 15 – Pipelines, Multicollinearity, and Generalization

DSC 80, Spring 2024

Announcements 📣

Agenda 📆

Question 🤔 (Answer at q.dsc80.com)

Pipelines

Pipelines in sklearn

Our first Pipeline

More sophisticated Pipelines

Planning our first ColumnTransformer

Building a Pipeline using a ColumnTransformer

Aside: FunctionTransformer

💡 Pro-Tip: Using make_pipeline and make_column_transformer

An example Pipeline

The purpose of standardizing features

Aside: Pipelines of just transformers

Question 🤔 (Answer at q.dsc80.com)

Multicollinearity

Heights and weights

Motivating example

Redundant features

Infinitely many parameter choices

Multicollinearity

Example: One hot encoding

Key takeaways

Question 🤔 (Answer at q.dsc80.com)

Generalization

Motivation

Evaluating the quality of a model

Example: Overfitting and underfitting

Polynomial regression

Bias and variance

Risk vs. empirical risk

The bias-variance decomposition

Navigating the bias-variance tradeoff

Question 🤔 (Answer at q.dsc80.com)

Train-test splits

Avoiding overfitting

Train-test split 🚆

Example train-test split

Hyperparameters

Example: Polynomial regression

Parameters vs. hyperparameters

Training error vs. test error

Training error vs. test error

Polynomial degree vs. train/test error

Training error vs. test error

Conducting train-test splits

But wait...

Summary, next time

Summary

Next time

`Pipeline`s in `sklearn`

Our first `Pipeline`

More sophisticated `Pipeline`s

Planning our first `ColumnTransformer`

Building a `Pipeline` using a `ColumnTransformer`

Aside: `FunctionTransformer`

💡 Pro-Tip: Using `make_pipeline` and `make_column_transformer`

An example `Pipeline`

Aside: `Pipeline`s of just transformers