import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)
import warnings
warnings.simplefilter('ignore')
For fun: read this article from CBS 8 San Diego 💰🚒🚔.
Last year, the city paid \$52.93 million in overtime to the fire department. The San Diego Police Department received the second-highest amount of any department, at \$36.6 million.
Linear regression in sklearn

Let's use sklearn to fit a linear model that predicts 'tip' from 'total_bill' and 'size'.

tips = sns.load_dataset('tips')
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
First, we instantiate and fit. By calling fit, we are saying "minimize mean squared error and find $w^*$".
lr = LinearRegression()
# Note that there are two arguments to fit – X and y!
# (It is not necessary to write X= and y=)
lr.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
LinearRegression()
After fitting, the predict method is available. Note that the argument to predict can be any 2D array with two columns.
# Predicted tip from a table of 3 that spends $25
lr.predict([[25, 3]])
array([3.56457154])
# Predicted tip from a table of 14 that spends $1000 – probably not accurate!
lr.predict([[1000, 14]])
array([96.07865069])
We can access the intercept and slopes individually. This model is of the form
$$\text{predicted tip} = w_0^* + w_1^* \cdot \text{total bill} + w_2^* \cdot \text{table size}$$
so we should expect three parameters total.
lr.intercept_
0.6689447408125031
lr.coef_
array([0.09271334, 0.19259779])
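To make the connection to the equation above concrete, here's a quick sanity check (a sketch using only the fitted attributes shown above): plugging a total bill of \$25 and a table size of 3 into the equation should reproduce the lr.predict([[25, 3]]) output from earlier.

# Sanity check: plug total_bill = 25 and size = 3 into the fitted equation by hand
# This should match lr.predict([[25, 3]])
lr.intercept_ + lr.coef_[0] * 25 + lr.coef_[1] * 3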
If we want to compute the RMSE of our model, we need to find its predictions on every row in the training data (tips).
all_preds = lr.predict(tips[['total_bill', 'size']])
np.sqrt(np.mean((all_preds - tips['tip']) ** 2))
1.007256127114662
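As an aside, sklearn.metrics can compute the same quantity for us. A minimal sketch (note: depending on your sklearn version, the squared=False flag may be replaced by a separate root_mean_squared_error function):

from sklearn.metrics import mean_squared_error
# RMSE again, this time via sklearn.metrics; squared=False takes the square root of the MSE
mean_squared_error(tips['tip'], all_preds, squared=False)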
It turns out that fit LinearRegression objects also have a score method:
lr.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612587
That doesn't look like the RMSE... what is it? 🤔
Recall, all_preds contains the predicted 'tip' for every data point in tips.
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
all_preds[:5]
array([2.62933992, 2.20539403, 3.19464533, 3.24959215, 3.71915687])
Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$
np.var(all_preds) / np.var(tips['tip'])
0.4678693087961252
Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$
Note: By correlation here, we are referring to $r$.
(np.corrcoef(all_preds, tips['tip'])) ** 2
array([[1.        , 0.46786931],
       [0.46786931, 1.        ]])
Method 3: lr.score
lr.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612587
All three methods provide the same result!
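For completeness, sklearn.metrics.r2_score computes the same value directly from the actual and predicted values. A quick check:

from sklearn.metrics import r2_score
# Should match lr.score and the two manual computations above
r2_score(tips['tip'], all_preds)

Note that Methods 1 and 2 agree with $R^2$ here because our predictions come from a least squares linear model with an intercept; r2_score and lr.score apply more generally.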
LinearRegression summary

| Property | Example | Description |
|---|---|---|
| Initialize model parameters | lr = LinearRegression() | Create an (empty) linear regression model |
| Fit the model to the data | lr.fit(data, responses) | Determines regression coefficients |
| Use model for prediction | lr.predict(newdata) | Use the regression line to make predictions |
| Evaluate the model | lr.score(data, responses) | Calculate the $R^2$ of the LR model |
| Access model attributes | lr.coef_ | Access the regression coefficients |
Note: Once fit, estimators like LinearRegression are just transformers (predict <-> transform).
So far, we've used transformers for feature engineering and models for prediction. We can combine these steps into a single Pipeline.
Pipelines in sklearn

- A Pipeline object is instantiated using a list containing transformer(s) and a model (estimator): pl = Pipeline([feat_trans1, feat_trans2, ..., mdl])
- Once a Pipeline is instantiated, you can fit all steps (transformers and model) using fit: pl.fit(data, responses)
- To make predictions using raw, untransformed data, use pl.predict.

Creating a Pipeline

- To instantiate a Pipeline, we must provide a list with zero or more transformers followed by a single model.
- All steps must have fit methods, and all but the last must have transform methods.
- The list we instantiate a Pipeline with must be a list of tuples, where the first element of each tuple is a name (a string) we choose for the step and the second element is the transformer or estimator instance itself.

Let's build a Pipeline that:

- one-hot encodes the categorical features in tips, and
- fits a linear regression model on the one-hot encoded data.
.tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
| | sex | smoker | day | time |
---|---|---|---|---|
0 | Female | No | Sun | Dinner |
1 | Male | No | Sun | Dinner |
2 | Male | No | Sun | Dinner |
3 | Male | No | Sun | Dinner |
4 | Female | No | Sun | Dinner |
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
pl = Pipeline([
('one-hot', OneHotEncoder()),
('lin-reg', LinearRegression())
])
Now that pl is instantiated, we fit it the same way we would fit the individual steps.
pl.fit(tips_cat, tips['tip'])
Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])
Now, to make predictions using raw data, all we need to do is use pl.predict:
pl.predict([['Male', 'Yes', 'Sat', 'Lunch']])
array([2.58813051])
pl.predict(tips_cat.iloc[:5])
array([3.10415414, 3.27436302, 3.27436302, 3.27436302, 3.10415414])
pl performs both feature transformation and prediction with just a single call to predict!
We can access the individual "steps" of a Pipeline through the named_steps attribute:
pl.named_steps
{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}
pl.named_steps['one-hot'].transform(tips_cat).toarray()
array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])
pl.named_steps['lin-reg'].coef_
array([-0.08510444, 0.08510444, -0.04216238, 0.04216238, -0.20256076, -0.12962763, 0.13756057, 0.19462781, 0.25168453, -0.25168453])
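As a sketch of what pl.predict is doing under the hood, we can chain the named steps by hand; the result should match the pl.predict(tips_cat.iloc[:5]) output from above.

# Manually one-hot encode, then predict – equivalent to pl.predict(tips_cat.iloc[:5])
ohe_features = pl.named_steps['one-hot'].transform(tips_cat.iloc[:5])
pl.named_steps['lin-reg'].predict(ohe_features)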
More complicated Pipelines

- In the previous example, we one-hot encoded every input column. What if we want to apply different transformations to different columns?
- Solution: instantiate a ColumnTransformer using a list of tuples, where:
  - the first element is a name we choose for the transformation,
  - the second element is a transformer instance (e.g. OneHotEncoder()), and
  - the third element is a list of relevant column names.
- ColumnTransformer is extremely useful, but it was only added to sklearn in 2018!

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
Let's perform different transformations on the quantitative and categorical features of tips (so, we will not transform 'tip').
tips_features = tips.drop('tip', axis=1)
tips_features.head()
| | total_bill | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
0 | 16.99 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | Female | No | Sun | Dinner | 4 |
- To the quantitative features ('total_bill' and 'size'), we will apply the StandardScaler transformer.
- To the categorical features ('sex', 'smoker', 'day', and 'time'), we will apply the OneHotEncoder transformer.

preproc = ColumnTransformer(
transformers = [
('quant', StandardScaler(), ['total_bill', 'size']),
('cat', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
]
)
Now, let's create a Pipeline using preproc as a transformer, and fit it:
pl = Pipeline([
('preprocessor', preproc),
('lin-reg', LinearRegression())
])
pl.fit(tips_features, tips['tip'])
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('quant', StandardScaler(), ['total_bill', 'size']), ('cat', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])])), ('lin-reg', LinearRegression())])
Prediction is as easy as calling predict:
tips_features.head()
| | total_bill | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
0 | 16.99 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | Female | No | Sun | Dinner | 4 |
pl.predict(tips_features.head())
array([2.73565486, 2.25086733, 3.25904369, 3.33533199, 3.80574011])
pl also has a score method, the same way a fit LinearRegression instance does:
pl.score(tips_features, tips['tip'])
0.47007812322060794
Recall, we can access the individual "steps" in pl using the named_steps attribute:
pl.named_steps['preprocessor'].transform(tips_features)
array([[-0.31471131, -0.60019263,  1.        , ...,  0.        ,  1.        ,  0.        ],
       [-1.06323531,  0.45338292,  0.        , ...,  0.        ,  1.        ,  0.        ],
       [ 0.1377799 ,  0.45338292,  0.        , ...,  0.        ,  1.        ,  0.        ],
       ...,
       [ 0.3246295 , -0.60019263,  0.        , ...,  0.        ,  1.        ,  0.        ],
       [-0.2212865 , -0.60019263,  0.        , ...,  0.        ,  1.        ,  0.        ],
       [-0.1132289 , -0.60019263,  1.        , ...,  1.        ,  1.        ,  0.        ]])
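To see which column of the transformed array corresponds to which feature, recent versions of sklearn (1.0 and later) can report the output feature names of a fit ColumnTransformer. A sketch, assuming your version supports get_feature_names_out:

# Names of the columns in the transformed array above (requires sklearn 1.0+)
pl.named_steps['preprocessor'].get_feature_names_out()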
Note: ColumnTransformer has a remainder argument that you can use to specify what to do with columns that aren't being transformed ('drop', the default, or 'passthrough').
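For instance, here is a minimal sketch of a preprocessor that standardizes only 'total_bill' and passes every other column through untouched (the 'quant_only' label is just an illustrative name):

# Sketch: standardize 'total_bill' only; keep all other columns as-is
preproc_passthrough = ColumnTransformer(
    transformers=[('quant_only', StandardScaler(), ['total_bill'])],
    remainder='passthrough'
)
preproc_passthrough.fit_transform(tips_features)[:2]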
Let's collect two samples $\{(x_i, y_i)\}$ from the same data generating process.
np.random.seed(23) # For reproducibility
def sample_dgp(n=100):
x = np.linspace(-2, 3, n)
y = x ** 3 + (np.random.normal(0, 3, size=n))
return x.reshape(-1, 1), y
x1, y1 = sample_dgp()
x2, y2 = sample_dgp()
For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$ (remember, in reality, you won't get to see the DGP).
plt.scatter(x1, y1);
Let's fit three polynomial models on Sample 1: one of degree 1, one of degree 3, and one of degree 15.
The PolynomialFeatures transformer will be helpful here.
from sklearn.preprocessing import PolynomialFeatures
d3 = PolynomialFeatures(3)
d3.fit_transform([[1], [2], [3], [4], [5]]) # fit_transform fits and transforms the same input
array([[  1.,   1.,   1.,   1.],
       [  1.,   2.,   4.,   8.],
       [  1.,   3.,   9.,  27.],
       [  1.,   4.,  16.,  64.],
       [  1.,   5.,  25., 125.]])
degs = [1, 3, 15]
def fit_polys(x, y):
    # Create all three Pipelines
    pls = [
        Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
        for d in degs
    ]
    # Fit all three Pipelines
    for pl in pls:
        pl.fit(x, y)
    # Make all three sets of predictions
    preds = [pl.predict(x) for pl in pls]
    return preds
preds_1 = fit_polys(x1, y1)
Below, we look at our three models' predictions on Sample 1 (which they were trained on).
plt.subplots(1, 3, figsize=(15, 5), dpi=100)
plt.suptitle('Performance on Sample 1\n')
for i in range(1, 4):
plt.subplot(1, 3, i)
plt.scatter(x1, y1, label='actual')
plt.plot(x1, preds_1[i-1], color='orange', label='predictions', linewidth=5)
rmse_d = np.sqrt(np.mean((preds_1[i-1] - y1) ** 2))
plt.title(f'Degree {degs[i-1]}, RMSE: {np.round(rmse_d, 2)}')
plt.legend(loc=(1.04, 0))
plt.show()
The degree 15 polynomial has the lowest RMSE on Sample 1.
How do things look in Sample 2?
plt.subplots(1, 3, figsize=(15, 5), dpi=100)
plt.suptitle('Performance on Sample 2\n')
for i in range(1, 4):
plt.subplot(1, 3, i)
plt.scatter(x2, y2, label='actual')
plt.plot(x1, preds_1[i-1], color='orange', label='predictions', linewidth=5)
rmse_d = np.sqrt(np.mean((preds_1[i-1] - y2) ** 2))
plt.title(f'Degree {degs[i-1]}, RMSE: {np.round(rmse_d, 2)}')
plt.legend(loc=(1.04, 0))
plt.show()
What if we fit a degree 1, degree 3, and degree 15 polynomial on Sample 2 as well?
preds_2 = fit_polys(x2, y2)
plt.subplots(1, 3, figsize=(15, 5), dpi=100)
plt.suptitle('Models Fit on Samples 1 and 2\n')
for i in range(1, 4):
plt.subplot(1, 3, i)
# plt.scatter(x2, y2, label='actual')
plt.plot(x1, preds_1[i-1], color='orange', label='Fit on Sample 1', linewidth=5)
plt.plot(x2, preds_2[i-1], color='purple', label='Fit on Sample 2', linewidth=5)
plt.title(f'Degree {degs[i-1]}')
plt.legend(loc=(1.04, 0))
plt.show()
Key idea: The degree 15 polynomial seems to vary more than the degree 3 and 1 polynomials do.
The training data we have access to is a sample from the DGP. We are concerned with our model's performance across different datasets from the same DGP.
Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise: bias, model variance, and observation variance (irreducible noise in the data).
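To make "model variance" concrete, here is a rough sketch (reusing sample_dgp and degs from above) that refits each degree on many fresh samples from the DGP and measures how much the prediction at a single point, $x = 2.5$, varies across fits. The number of repetitions (50) and the evaluation point are arbitrary choices for illustration.

# Sketch: estimate model variance by refitting on many samples from the DGP
point = np.array([[2.5]])
preds_at_point = {d: [] for d in degs}
for _ in range(50):
    x_s, y_s = sample_dgp()
    for d in degs:
        pl_d = Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
        pl_d.fit(x_s, y_s)
        preds_at_point[d].append(pl_d.predict(point)[0])
# Larger values => the model's prediction depends more on which sample we happened to draw
{d: np.var(preds_at_point[d]) for d in degs}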
sklearn.model_selection.train_test_split implements a train-test split for us! 🙏🏼

If X is an array/DataFrame of features and y is an array/Series of responses, then

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

randomly splits the features and responses into training and test sets, such that the test set contains 25% of the full dataset.
from sklearn.model_selection import train_test_split
# Read the documentation!
train_test_split?
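One detail worth knowing: the split is random, so it changes every time you run it. Passing random_state makes the split reproducible across runs. A tiny self-contained sketch (the array and seed are arbitrary):

# The same shuffle and split every time, thanks to random_state
train_test_split(np.arange(10), test_size=0.3, random_state=42)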
Let's perform a train/test split on our tips dataset.
X = tips.drop('tip', axis=1)
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Don't have to choose 0.25
Before proceeding, let's check the sizes of X_train and X_test.
print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())
Rows in X_train: 195
| | total_bill | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
4 | 24.59 | Female | No | Sun | Dinner | 4 |
209 | 12.76 | Female | Yes | Sat | Dinner | 2 |
178 | 9.60 | Female | Yes | Sun | Dinner | 2 |
230 | 24.01 | Male | Yes | Sat | Dinner | 4 |
5 | 25.29 | Male | No | Sun | Dinner | 4 |
Rows in X_test: 49
| | total_bill | sex | smoker | day | time | size |
---|---|---|---|---|---|---|
146 | 18.64 | Female | No | Thur | Lunch | 3 |
224 | 13.42 | Male | Yes | Fri | Lunch | 2 |
134 | 18.26 | Female | No | Thur | Lunch | 2 |
131 | 20.27 | Female | No | Thur | Lunch | 2 |
147 | 11.87 | Female | No | Thur | Lunch | 2 |
X_train.shape[0] / tips.shape[0]
0.7991803278688525
Steps:

1. Perform a train/test split.
2. Fit a model on the training set only.
3. Evaluate the model's error on both the training set and the test set.
X = tips[['total_bill', 'size']] # For this example, we'll use just the already-quantitative columns in tips
y = tips['tip']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Here, we'll use a stand-alone LinearRegression model without a Pipeline, but this process would work the same if we were using a Pipeline.
lr = LinearRegression()
lr.fit(X_train, y_train)
LinearRegression()
Let's check our model's performance on the training set first.
pred_train = lr.predict(X_train)
rmse_train = np.sqrt(np.mean((pred_train - y_train) ** 2))
rmse_train
1.0142193530579229
And the test set:
pred_test = lr.predict(X_test)
rmse_test = np.sqrt(np.mean((pred_test - y_test) ** 2))
rmse_test
0.983914318376069
Since rmse_train and rmse_test are similar, it doesn't seem like our model is overfitting to the training data. If rmse_test were much larger than rmse_train, that would be evidence that our model is unable to generalize well.
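To connect this back to the polynomial example, here is a rough sketch (reusing Sample 1, degs, and the Pipeline pattern from fit_polys) that compares training and test RMSE for each degree. The test_size and random_state values are arbitrary choices; with a high-degree polynomial, we would typically expect the test RMSE to be noticeably worse than the training RMSE.

# Sketch: train vs. test RMSE for degree 1, 3, and 15 polynomial fits on Sample 1
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.25, random_state=23)
for d in degs:
    pl_d = Pipeline([('poly', PolynomialFeatures(d)), ('lin-reg', LinearRegression())])
    pl_d.fit(x1_train, y1_train)
    rmse_tr = np.sqrt(np.mean((pl_d.predict(x1_train) - y1_train) ** 2))
    rmse_te = np.sqrt(np.mean((pl_d.predict(x1_test) - y1_test) ** 2))
    print(f'Degree {d}: train RMSE = {rmse_tr:.2f}, test RMSE = {rmse_te:.2f}')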
Pipelines in sklearn combine one or more transformers with a single model (estimator), allowing us to perform feature engineering and prediction through a single object.