InĀ [1]:
from dsc80_utils import *
InĀ [2]:
# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)
InĀ [3]:
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))

Lecture 14 – Feature Engineering, Pipelines¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Project 4 out, due next Friday, Dec 1.
    • No project checkpoint due because of Thanksgiving, so everyone gets full credit for the checkpoint!
    • But start early! Historically, the last two questions of the project take up about 75% of the project time.
  • Lab 8 out, due Nov 27.
  • No lecture / discussion / OH on Thurs or Fri. Happy Thanksgiving! 🦃

📆 Agenda¶

  • Review: Linear models for restaurant tips
  • Feature engineering
  • Scikit-learn pipelines

🙋🙋🏽‍♀️ Slido¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Review: Linear Models for Restaurant Tips 🧑‍🍳¶

InĀ [4]:
tips
Out[4]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
... ... ... ... ... ... ... ...
241 17.47 3.50 Female No Thur Lunch 2
242 10.07 1.25 Male No Sat Dinner 2
243 16.93 3.07 Female No Sat Dinner 3

244 rows × 7 columns

InĀ [5]:
from sklearn.linear_model import LinearRegression

mean_tip = tips['tip'].mean()

model = LinearRegression()
model.fit(X=tips[['total_bill']], y=tips['tip'])

model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
Out[5]:
LinearRegression()
InĀ [6]:
rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)

all_preds = model.predict(tips[['total_bill']])
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)

rmse_dict['two features'] = rmse(
    tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)
rmse_dict
Out[6]:
{'constant tip amount': 1.3807999538298956,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.0072561271146618}

Calculating $R^2$¶

Below, we store model_two's predicted 'tip' for every row in tips in a column called 'predicted'.

InĀ [7]:
pred = tips.assign(predicted=model_two.predict(tips[['total_bill', 'size']]))
pred
Out[7]:
total_bill tip sex smoker day time size predicted
0 3.07 1.00 Female Yes Sat Dinner 1 1.15
1 18.78 3.00 Female No Thur Dinner 2 2.80
2 26.59 3.41 Male Yes Sat Dinner 3 3.71
... ... ... ... ... ... ... ... ...
241 17.47 3.50 Female No Thur Lunch 2 2.67
242 10.07 1.25 Male No Sat Dinner 2 1.99
243 16.93 3.07 Female No Sat Dinner 3 2.82

244 rows × 8 columns

Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$

InĀ [8]:
np.var(pred['predicted']) / np.var(pred['tip'])
Out[8]:
0.4678693087961255

Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$

Note: By correlation here, we are referring to $r$, the same correlation coefficient you saw in DSC 10.

InĀ [9]:
# There was a typo here last lecture, the correct code is:
(np.corrcoef(pred['predicted'], pred['tip'])) ** 2
Out[9]:
array([[1.  , 0.47],
       [0.47, 1.  ]])
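
np.corrcoef returns the full 2×2 correlation matrix, so the value we're after is the (squared) off-diagonal entry. A small sketch of pulling out the single number:

# Extract r, the correlation between predicted and actual tips, then square it.
r = np.corrcoef(pred['predicted'], pred['tip'])[0, 1]
r ** 2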

Method 3: LinearRegression.score

InĀ [10]:
model_two.score(tips[['total_bill', 'size']], tips['tip'])
Out[10]:
0.46786930879612565

All three methods provide the same result!

What's next?¶

InĀ [11]:
tips.head()
Out[11]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
  • So far, in our journey to predict 'tip', we've only used the existing numerical features in our dataset, 'total_bill' and 'size'.

  • There's a lot of information in tips that we didn't use – 'sex', 'smoker', 'day', and 'time', for example. We can't use these features in their current form, because they're non-numeric.

  • How do we use categorical features in a regression model?

Feature engineering ⚙️¶

The goal of feature engineering¶

  • Feature engineering is the act of finding transformations that turn raw data into effective quantitative variables.

  • A feature function $\phi$ (phi, pronounced "fee") is a mapping from raw data to $d$-dimensional space, i.e. $\phi: \text{raw data} \rightarrow \mathbb{R}^d$.

    • If two observations $x_i$ and $x_j$ are "similar" in the raw data space, then $\phi(x_i)$ and $\phi(x_j)$ should also be "similar."
  • A "good" choice of features depends on many factors:

    • The kind of data (quantitative, ordinal, nominal).
    • The relationship(s) and association(s) being modeled.
    • The model type (e.g. linear models, decision tree models, neural networks).

One hot encoding¶

  • One hot encoding is a transformation that turns a categorical feature into several binary features.

  • Suppose column 'col' has $N$ unique values, $A_1$, $A_2$, ..., $A_N$. For each unique value $A_i$, we define the following feature function:

$$\phi_i(x) = \left\{\begin{array}{ll}1 & {\rm if\ } x = A_i \\ 0 & {\rm if\ } x\neq A_i \\ \end{array}\right. $$
  • Note that 1 means "yes" and 0 means "no".

  • One hot encoding is also called "dummy encoding", and $\phi(x)$ may also be referred to as an "indicator variable".
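
As a quick illustration (a minimal sketch using pandas; this is not the sklearn approach we'll use later in the lecture), pd.get_dummies builds these indicator columns for us:

# Each unique value in the Series becomes its own 0/1 indicator column.
pd.get_dummies(pd.Series(['Yes', 'No', 'No', 'Yes'], name='smoker'))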

Example: One hot encoding 'smoker'¶

For each unique value of 'smoker' in our dataset, we create a separate binary column that is 1 for rows with that value and 0 otherwise. (Remember, 'smoker' is 'Yes' when the table was in the smoking section of the restaurant and 'No' otherwise.)

InĀ [12]:
tips.head()
Out[12]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
InĀ [13]:
tips['smoker'].value_counts()
Out[13]:
No     151
Yes     93
Name: smoker, dtype: int64
InĀ [14]:
(tips['smoker'] == 'Yes').astype(int).head()
Out[14]:
0    1
1    0
2    1
3    0
4    0
Name: smoker, dtype: int64
InĀ [15]:
for val in tips['smoker'].unique():
    tips[f'smoker == {val}'] = (tips['smoker'] == val).astype(int)
InĀ [16]:
tips.head()
Out[16]:
total_bill tip sex smoker ... time size smoker == Yes smoker == No
0 3.07 1.00 Female Yes ... Dinner 1 1 0
1 18.78 3.00 Female No ... Dinner 2 0 1
2 26.59 3.41 Male Yes ... Dinner 3 1 0
3 14.26 2.50 Male No ... Lunch 2 0 1
4 21.16 3.00 Male No ... Lunch 2 0 1

5 rows × 9 columns

Model #4: Multiple linear regression using total bill, table size, and smoker status¶

Now that we've converted 'smoker' to a numerical variable, we can use it as input in a regression model. Here's the model we'll try to fit:

$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill} + w_2 \cdot \text{table size} + w_3 \cdot \text{smoker == Yes}$$

Subtlety: There's no need to use both 'smoker == No' and 'smoker == Yes'. If we know the value of one, we already know the value of the other. We can use either one.
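
We can verify this redundancy with the columns we created above (a quick sketch): the two indicators always sum to 1, so knowing one determines the other.

# 'smoker == Yes' and 'smoker == No' are complementary indicator columns.
(tips['smoker == Yes'] + tips['smoker == No']).value_counts()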

InĀ [17]:
model_three = LinearRegression()
model_three.fit(tips[['total_bill', 'size', 'smoker == Yes']], tips['tip'])
Out[17]:
LinearRegression()

The following cell gives us our $w^*$s:

InĀ [18]:
model_three.intercept_, model_three.coef_
Out[18]:
(0.7090155167346053, array([ 0.09,  0.18, -0.08]))

Thus, our trained linear model to predict tips given total bills, table sizes, and smoker status (yes or no) is:

$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$

Visualizing Model #4¶

Our new fit model is:

$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$

To visualize our data and linear model, we'd need 4 dimensions:

  • One for total bill
  • One for table size
  • One for 'smoker == Yes'.
  • One for tip.

Humans can't visualize in 4D, but there's a workaround: 'smoker == Yes' only takes on two possible values, 1 or 0, so let's look at those two cases separately.

Case 1: 'smoker == Yes' is 1, meaning that the table was in the smoking section.

$$\begin{align*} \text{predicted tip} &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot 1 \\ &= 0.626 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} \end{align*}$$

Case 2: 'smoker == Yes' is 0, meaning that the table was not in the smoking section.

$$\begin{align*} \text{predicted tip} &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot 0 \\ &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} \end{align*}$$

Key idea: These are two parallel planes in 3D, with different $z$-intercepts!

Note that the two planes are very close to one another – you'll have to zoom in to see the difference.

InĀ [19]:
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z_0 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 0
Z_1 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 1
plane_0 = go.Surface(x=XX, y=YY, z=Z_0, colorscale='Greens')
plane_1 = go.Surface(x=XX, y=YY, z=Z_1, colorscale='Purples')

fig = go.Figure(data=[plane_0, plane_1])

tips_0 = tips[tips['smoker'] == 'No']
tips_1 = tips[tips['smoker'] == 'Yes']

fig.add_trace(go.Scatter3d(x=tips_0['total_bill'], 
                           y=tips_0['size'], 
                           z=tips_0['tip'], mode='markers', marker = {'color': 'green'}))

fig.add_trace(go.Scatter3d(x=tips_1['total_bill'], 
                           y=tips_1['size'], 
                           z=tips_1['tip'], mode='markers', marker = {'color': 'purple'}))

fig.update_layout(scene = dict(
    xaxis_title='Total Bill',
    yaxis_title='Table Size',
    zaxis_title='Tip'),
  title='Tip vs. Total Bill and Table Size (Green = Non-Smoking Section, Purple = Smoking Section)',
    width=1000, height=800,
    showlegend=False)

If we want to visualize in 2D, we need to pick a single feature to place on the $x$-axis.

InĀ [20]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=tips['total_bill'], y=tips['tip'], 
                         mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=tips['total_bill'], y=model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]), 
                         mode='markers', name='Predicted Tips using Total Bill, <br>Table Size, and Smoker Status'))

fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

Despite being a linear model, why doesn't this model look like a straight line?

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Comparing Model #4 to earlier models¶

InĀ [21]:
rmse_dict['three features'] = rmse(tips['tip'], 
                                   model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]))
rmse_dict
Out[21]:
{'constant tip amount': 1.3807999538298956,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.0072561271146618,
 'three features': 1.0064899786822126}

Adding 'smoker == Yes' decreased the training RMSE of our model, but barely.

Reflection¶

InĀ [22]:
tips.head()
Out[22]:
total_bill tip sex smoker ... time size smoker == Yes smoker == No
0 3.07 1.00 Female Yes ... Dinner 1 1 0
1 18.78 3.00 Female No ... Dinner 2 0 1
2 26.59 3.41 Male Yes ... Dinner 3 1 0
3 14.26 2.50 Male No ... Lunch 2 0 1
4 21.16 3.00 Male No ... Lunch 2 0 1

5 rows × 9 columns

  • We've one hot encoded 'smoker', but it required a for-loop.

  • Is there an easy way to one hot encode all four categorical columns – 'sex', 'smoker', 'day', and 'time' – all at once, without using a for-loop?

  • Yes, using sklearn.preprocessing's OneHotEncoder. More on this soon!

Example: Predicting ratings ⭐️¶


UID   AGE   STATE   HAS_BOUGHT   REVIEW                       | RATING
74    32    NY      True         "Meh."                       | ✩✩
42    50    WA      True         "Worked out of the box..."   | ✩✩✩✩
57    16    CA      NULL         "Hella tots lit yo..."       | ✩
...   ...   ...     ...          ...                          | ...
(int) (int) (str)   (bool)       (str)                        | (str)
  • We want to build a multiple regression model that predicts 'RATING' using the above features.

  • Why can't we build a model right away? What must we do so that we can build a model?

  • Some issues: Missing values, emojis and strings instead of numbers, unrelated columns.

Uninformative features¶

  • 'UID' was likely used to join the user information (e.g., 'AGE' and 'STATE') with some reviews dataset.
  • Even though 'UID's are stored as numbers, the numerical value of a user's 'UID' won't help us predict their 'RATING'.
  • If we include the 'UID' feature, our model will find whatever patterns it can between 'UID's and 'RATING's in the training (observed) data.
    • This will lead to a lower training RMSE.
  • However, since there is truly no relationship between 'UID' and 'RATING', this will lead to worse model performance on unseen data (bad).
  • Transformation: drop 'UID'.

Dropping features¶

There are certain scenarios where manually dropping features might be helpful:

  1. When the features do not contain information associated with the prediction task.
  2. When the feature is not available at prediction time.
  • The goal of building a model to predict 'RATING's is to predict 'RATING's for users who haven't actually made a 'RATING' yet.
  • As such, our model should only depend on features that we would know before the user makes their 'RATING'.
  • For instance, if users only enter 'REVIEW's after entering 'RATING's, we shouldn't use 'REVIEW's as a feature.
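
As a small sketch (the reviews DataFrame below is typed out by hand from the table above, purely for illustration), dropping both kinds of features is a single call:

# Hypothetical reviews DataFrame mirroring the table above.
reviews = pd.DataFrame({
    'UID': [74, 42, 57],
    'AGE': [32, 50, 16],
    'STATE': ['NY', 'WA', 'CA'],
    'HAS_BOUGHT': [True, True, None],
    'REVIEW': ['Meh.', 'Worked out of the box...', 'Hella tots lit yo...'],
    'RATING': ['✩✩', '✩✩✩✩', '✩'],
})

# Drop 'UID' (uninformative) and 'REVIEW' (not available at prediction time).
reviews.drop(columns=['UID', 'REVIEW'])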

Encoding ordinal features¶

UID   AGE   STATE   HAS_BOUGHT   REVIEW                       | RATING
74    32    NY      True         "Meh."                       | ✩✩
42    50    WA      True         "Worked out of the box..."   | ✩✩✩✩
57    16    CA      NULL         "Hella tots lit yo..."       | ✩
...   ...   ...     ...          ...                          | ...
(int) (int) (str)   (bool)       (str)                        | (str)

How do we encode the 'RATING' column, an ordinal variable, as a quantitative variable?

  • Transformation: Replace "number of ✩" with "number".
    • This is an ordinal encoding, a transformation that maps ordinal values to the positive integers in a way that preserves order.
    • Example: (freshman, sophomore, junior, senior) -> (0, 1, 2, 3).
    • Important: This transformation preserves "distances" between ratings.
InĀ [23]:
ordinal_enc = {
    '✩': 1,
    '✩✩': 2,
    '✩✩✩': 3,
    '✩✩✩✩': 4,
    '✩✩✩✩✩': 5,
}
ordinal_enc
Out[23]:
{'✩': 1, '✩✩': 2, '✩✩✩': 3, '✩✩✩✩': 4, '✩✩✩✩✩': 5}
InĀ [24]:
ratings = pd.DataFrame().assign(RATING=['✩', '✩✩', '✩✩✩', '✩✩', '✩✩✩', '✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩'])
ratings
Out[24]:
RATING
0 ✩
1 ✩✩
2 ✩✩✩
... ...
6 ✩✩✩
7 ✩✩✩✩
8 ✩✩✩✩✩

9 rows × 1 columns

InĀ [25]:
ratings.replace(ordinal_enc)
Out[25]:
RATING
0 1
1 2
2 3
... ...
6 3
7 4
8 5

9 rows × 1 columns

Encoding nominal features¶

UID   AGE   STATE   HAS_BOUGHT   REVIEW                       | RATING
74    32    NY      True         "Meh."                       | ✩✩
42    50    WA      True         "Worked out of the box..."   | ✩✩✩✩
57    16    CA      NULL         "Hella tots lit yo..."       | ✩
...   ...   ...     ...          ...                          | ...
(int) (int) (str)   (bool)       (str)                        | (str)

How do we encode the 'STATE' column, a nominal variable, as a quantitative variable? In other words, how do we turn 'STATE's into meaningful numbers?

  • Question: Why can't we use an ordinal encoding, e.g. NY -> 0, WA -> 1?

  • Answer: There is no inherent ordering to states, e.g. WA is not inherently "more" of anything than NY.

  • We've already seen the correct strategy: one hot encoding.
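
Reusing the hypothetical reviews DataFrame from the sketch earlier, one hot encoding 'STATE' could look like this:

# Each unique state becomes its own 0/1 indicator column.
pd.get_dummies(reviews['STATE'])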

Example: Horsepower 🚗¶

The following dataset, built into the seaborn plotting library, contains various information about (older) cars.

InĀ [26]:
mpg = sns.load_dataset('mpg').dropna()
mpg.head()
Out[26]:
mpg cylinders displacement horsepower ... acceleration model_year origin name
0 18.0 8 307.0 130.0 ... 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 ... 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 ... 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 ... 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 ... 10.5 70 usa ford torino

5 rows × 9 columns

We really do mean old:

InĀ [27]:
mpg['model_year'].value_counts()
Out[27]:
73    40
78    36
76    34
      ..
71    27
80    27
74    26
Name: model_year, Length: 13, dtype: int64

Let's investigate the relationship between 'horsepower' and 'mpg'.

The relationship between 'horsepower' and 'mpg'¶

InĀ [28]:
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig
  • It appears that there is a negative association between 'horsepower' and 'mpg', though it's not quite linear.

  • Let's try and fit a simple linear model that uses 'horsepower' to predict 'mpg' and see what happens.

Predicting 'mpg' using 'horsepower'¶

InĀ [29]:
car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])
Out[29]:
LinearRegression()

What do our predictions look like?

InĀ [30]:
hp_points = pd.DataFrame({'horsepower': [25, 225]})
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(go.Scatter(
    x=hp_points['horsepower'],
    y=car_model.predict(hp_points),
    mode='lines',
    name='Predicted MPG using Horsepower'
))

Our regression line doesn't capture the curvature in the relationship between 'horsepower' and 'mpg'.

Let's look at the residuals:

InĀ [31]:
res = mpg.assign(
    pred=car_model.predict(mpg[['horsepower']]),
    resid=mpg['mpg'] - car_model.predict(mpg[['horsepower']]),
)
fig = px.scatter(res, x='pred', y='resid')
fig.add_hline(0, line_width=3, opacity=1)
InĀ [32]:
r2 = car_model.score(mpg[['horsepower']], mpg['mpg'])
print(f'Model R²: {r2:.2f}')
Model R²: 0.61

Transformations¶

The Tukey Mosteller Bulge Diagram helps us pick which transformations to apply to data in order to linearize it.

[Figure: Tukey Mosteller Bulge Diagram]

The bottom-left quadrant appears to match the shape of the scatter plot between 'horsepower' and 'mpg' the best – let's try taking the log of 'horsepower' ($X$).

InĀ [33]:
mpg['log hp'] = np.log(mpg['horsepower'])

What does our data look like now?

InĀ [34]:
px.scatter(mpg, x='log hp', y='mpg')

Predicting 'mpg' using log('horsepower')¶

Let's fit another linear model.

InĀ [35]:
car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])
Out[35]:
LinearRegression()

What do our predictions look like now?

InĀ [36]:
fig = px.scatter(mpg, x='log hp', y='mpg')
log_hp_points = pd.DataFrame({'log hp': [3.7, 5.5]})
fig = px.scatter(mpg, x='log hp', y='mpg')
fig.add_trace(go.Scatter(
    x=log_hp_points['log hp'],
    y=car_model_log.predict(log_hp_points),
    mode='lines',
    name='Predicted MPG using log(Horsepower)'
))

The fit looks a bit better! How about the $R^2$?

InĀ [37]:
car_model_log.score(mpg[['log hp']], mpg['mpg'])
Out[37]:
0.6683347641192137

Also a bit better!

What do our predictions look like on the original, non-transformed scatter plot? Let's see:

InĀ [38]:
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(
    go.Scatter(
        x=mpg['horsepower'], 
        y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),  
        mode='markers', name='Predicted MPG using log(Horsepower)', marker_color='red'
    )
)
fig

Our predictions that used $\log(\text{Horsepower})$ as an input don't fall on a straight line. We shouldn't expect them to; the red dots come from:

$$\text{Predicted MPG} = 108.698 - 18.582 \cdot \log(\text{Horsepower})$$
InĀ [39]:
car_model_log.intercept_, car_model_log.coef_
Out[39]:
(108.69970699574486, array([-18.58]))

Quantitative scaling¶

The feature transformations we've discussed so far have involved converting categorical variables into quantitative variables. However, our log transformation was an example of transforming a quantitative variable into a new quantitative variable; this practice is called quantitative scaling. Common examples include the following (a short code sketch of each appears after the list):

  • Standardization: $x_i \rightarrow \frac{x_i - \bar{x}}{\sigma_x}$.
  • Linearization via a non-linear transformation: e.g. $\text{log}$ and $\text{sqrt}$. See Lab 8 for more.
  • Discretization: Convert data into percentiles (or more generally, quantiles).
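
Here's that sketch, applied to the 'horsepower' column (pd.qcut is just one way to discretize into quantiles; the names below are made up for illustration):

hp = mpg['horsepower']

# Standardization: subtract the mean, divide by the SD.
standardized = (hp - hp.mean()) / hp.std()

# Linearization via a non-linear transformation (the log we used above).
logged = np.log(hp)

# Discretization: bucket values into quartiles, labeled 0 through 3.
quartiles = pd.qcut(hp, 4, labels=False)

pd.DataFrame({'horsepower': hp, 'standardized': standardized,
              'log': logged, 'quartile': quartiles}).head()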

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


The modeling process¶

  1. Create (engineer) features to best reflect the "meaning" behind data.

  2. Choose a model that is appropriate to capture the relationships between features ($X$) and the target/response ($y$).

  3. Select a loss function and fit the model (i.e., determine $w^*$).

  4. Evaluate the model (e.g. using RMSE or $R^2$).

We can perform all of the above directly in sklearn!

preprocessing and linear_model¶

For the feature engineering step of the modeling pipeline, we will use sklearn's preprocessing module.


For the model creation step of the modeling pipeline, we will use sklearn's linear_model module, as we've already seen. linear_model.LinearRegression is an example of an estimator class.


Transformers in sklearn¶

Transformer classes¶

  • Transformers take in "raw" data and output "processed" data. They are used for creating features.

  • The input to a transformer should be a multi-dimensional numpy array.

    • Inputs can be DataFrames, but sklearn only looks at the values (i.e. it calls to_numpy() on input DataFrames).
  • The output of a transformer is a numpy array (never a DataFrame or Series).

  • Transformers, like most relevant features of sklearn, are classes, not functions, meaning you need to instantiate them and call their methods.

Case study: Restaurant tips 🧑‍🍳¶

We'll continue working with our trusty tips dataset.

InĀ [40]:
tips.head()
Out[40]:
total_bill tip sex smoker ... time size smoker == Yes smoker == No
0 3.07 1.00 Female Yes ... Dinner 1 1 0
1 18.78 3.00 Female No ... Dinner 2 0 1
2 26.59 3.41 Male Yes ... Dinner 3 1 0
3 14.26 2.50 Male No ... Lunch 2 0 1
4 21.16 3.00 Male No ... Lunch 2 0 1

5 rows × 9 columns

Example transformer: Binarizer¶

The Binarizer transformer allows us to map a quantitative sequence to a sequence of 1s and 0s, depending on whether values are above or below a threshold.

Property                    | Example                        | Description
Initialize with parameters  | binar = Binarizer(threshold=t) | Set x=1 if x > t, else x=0
Transform data in a dataset | feat = binar.transform(data)   | Binarize all columns in data

First, we need to import the relevant class from sklearn.preprocessing. (Tip: import just the relevant classes you need from sklearn.)

InĀ [41]:
from sklearn.preprocessing import Binarizer

Let's try binarizing 'total_bill'. We'll say a "large" bill is one that is strictly greater than $20.

InĀ [42]:
tips['total_bill'].head()
Out[42]:
0     3.07
1    18.78
2    26.59
3    14.26
4    21.16
Name: total_bill, dtype: float64

First, we initialize a Binarizer object with the threshold we want.

InĀ [43]:
bi = Binarizer(threshold=20)

Then, we call bi's transform method and pass it the data we'd like to transform. Note that its input and output are both 2D.

InĀ [44]:
transformed_bills = bi.transform(tips[['total_bill']]) # Must give transform a 2D array/DataFrame.
transformed_bills[:5]
/Users/sam/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning:

X has feature names, but Binarizer was fitted without feature names

Out[44]:
array([[0.],
       [0.],
       [1.],
       [0.],
       [1.]])

Example transformer: StandardScaler¶

  • StandardScaler standardizes data using the mean and standard deviation of the data.
$$z(x_i) = \frac{x_i - \text{mean of } x}{\text{SD of } x}$$
  • Unlike Binarizer, StandardScaler requires some knowledge (the mean and SD) of the dataset before transforming.

  • As such, we need to fit a StandardScaler transformer before we can use the transform method.

  • Typical usage: fit the transformer on one sample, then use that fitted transformer to transform future data.

Example transformer: StandardScaler¶

It only makes sense to standardize the already-quantitative features of tips, so let's select just those.

InĀ [45]:
tips_quant = tips[['total_bill', 'size']]
tips_quant.head()
Out[45]:
total_bill size
0 3.07 1
1 18.78 2
2 26.59 3
3 14.26 2
4 21.16 2

Let's initialize a StandardScaler object.

InĀ [46]:
from sklearn.preprocessing import StandardScaler
InĀ [47]:
stdscaler = StandardScaler()

Note that the following does not work! The error message is very helpful.

InĀ [48]:
stdscaler.transform(tips_quant)
---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In[48], line 1
----> 1 stdscaler.transform(tips_quant)

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:970, in StandardScaler.transform(self, X, copy)
    955 def transform(self, X, copy=None):
    956     """Perform standardization by centering and scaling.
    957 
    958     Parameters
   (...)
    968         Transformed array.
    969     """
--> 970     check_is_fitted(self)
    972     copy = copy if copy is not None else self.copy
    973     X = self._validate_data(
    974         X,
    975         reset=False,
   (...)
    980         force_all_finite="allow-nan",
    981     )

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/validation.py:1222, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1217     fitted = [
   1218         v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
   1219     ]
   1221 if not fitted:
-> 1222     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

Instead, we need to first call the fit method on stdscaler.

InĀ [49]:
# This is like saying "determine the mean and SD of each column in tips_quant".
stdscaler.fit(tips_quant)
Out[49]:
StandardScaler()

Now, transform will work.

InĀ [50]:
# First column is 'total_bill', second column is 'size'.
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]
Out[50]:
array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       [-0.62, -0.6 ],
       [ 0.15, -0.6 ]])

We can also access the mean and variance stdscaler computed for each column:

InĀ [51]:
stdscaler.mean_
Out[51]:
array([19.79,  2.57])
InĀ [52]:
stdscaler.var_
Out[52]:
array([78.93,  0.9 ])
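
As a sanity check (a quick sketch), we can recompute the z-scores by hand. Note that StandardScaler divides by the population SD (ddof=0), not pandas' default sample SD:

# Manually z-score each column using its mean and population SD.
((tips_quant - tips_quant.mean()) / tips_quant.std(ddof=0)).head()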

Note that we can call transform on DataFrames other than tips_quant. We will do this often – fit a transformer on one dataset (training data) and use it to transform other datasets (test data).

InĀ [53]:
stdscaler.transform(tips_quant.sample(5))
Out[53]:
array([[ 0.97, -0.6 ],
       [-0.28, -0.6 ],
       [ 1.19,  1.51],
       [-0.49, -0.6 ],
       [ 0.65,  1.51]])

💡 Pro-Tip: Using .fit_transform¶

The .fit_transform method will fit the transformer and then transform the data in one go.

InĀ [54]:
stdscaler.fit_transform(tips_quant)
Out[54]:
array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       ...,
       [-0.26, -0.6 ],
       [-1.09, -0.6 ],
       [-0.32,  0.45]])

StandardScaler summary¶

Property                    | Example                           | Description
Initialize with parameters  | stdscaler = StandardScaler()      | z-score the data (no parameters)
Fit the transformer         | stdscaler.fit(X)                  | Compute the mean and SD of X
Transform data in a dataset | feat = stdscaler.transform(X_new) | z-score X_new using the mean and SD of X
Fit and transform           | stdscaler.fit_transform(X)        | Compute the mean and SD of X, then z-score X

Example transformer: OneHotEncoder¶

Let's keep just the categorical columns in tips.

InĀ [55]:
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
Out[55]:
sex smoker day time
0 Female Yes Sat Dinner
1 Female No Thur Dinner
2 Male Yes Sat Dinner
3 Male No Thur Lunch
4 Male No Thur Lunch

Like StandardScaler, we will need to fit our OneHotEncoder transformer before it can transform anything.

InĀ [56]:
from sklearn.preprocessing import OneHotEncoder
InĀ [57]:
ohe = OneHotEncoder()
ohe.fit(tips_cat)
Out[57]:
OneHotEncoder()

We can look at the unique values (i.e. categories) in each column by using the categories_ attribute:

InĀ [58]:
ohe.categories_
Out[58]:
[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object),
 array(['Dinner', 'Lunch'], dtype=object)]
InĀ [59]:
ohe.transform(tips_cat)
Out[59]:
<244x10 sparse matrix of type '<class 'numpy.float64'>'
	with 976 stored elements in Compressed Sparse Row format>

Since the resulting matrix is sparse – most of its elements are 0 – sklearn uses a more efficient representation than a regular numpy array. We can convert to a regular (dense) array:

InĀ [60]:
ohe.transform(tips_cat).toarray()
Out[60]:
array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

Notice that the column names from tips_cat are no longer stored anywhere (remember, fit converts the input to a numpy array before proceeding).

We can use the get_feature_names_out method on ohe to access the names of the one-hot-encoded columns, though:

InĀ [61]:
ohe.get_feature_names_out() # The prefixes (sex_, smoker_, day_, time_) correspond to the column names in tips_cat.
Out[61]:
array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)
InĀ [62]:
pd.DataFrame(ohe.transform(tips_cat).toarray(), 
             columns=ohe.get_feature_names_out()) # If we need a DataFrame back, for some reason.
Out[62]:
sex_Female sex_Male smoker_No smoker_Yes ... day_Sun day_Thur time_Dinner time_Lunch
0 1.0 0.0 0.0 1.0 ... 0.0 0.0 1.0 0.0
1 1.0 0.0 1.0 0.0 ... 0.0 1.0 1.0 0.0
2 0.0 1.0 0.0 1.0 ... 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ...
241 1.0 0.0 1.0 0.0 ... 0.0 1.0 0.0 1.0
242 0.0 1.0 1.0 0.0 ... 0.0 0.0 1.0 0.0
243 1.0 0.0 1.0 0.0 ... 0.0 0.0 1.0 0.0

244 rows × 10 columns

Pipelines¶


So far, we've used transformers for feature engineering and models for prediction. We can combine these steps into a single Pipeline.

Pipelines in sklearn¶

  • To instantiate a Pipeline, we must provide a list with zero or more transformers followed by a single model.

    • All "steps" must have fit methods, and all but the last must have transform methods.
    • Template: pl = Pipeline([feat_trans1, feat_trans2, ..., mdl]).
  • Once a Pipeline is instantiated, you can fit all steps (transformers and model) using a single call to the fit method.

pl.fit(X, y)
  • To make predictions using raw, untransformed data, use pl.predict.

  • The actual list we provide Pipeline with must be a list of tuples, where

    • The first element is a "name" (that we choose) for the step.
    • The second element is a transformer or estimator instance.

Our first Pipeline¶

Let's build a Pipeline that:

  • One hot encodes the categorical features in tips.
  • Fits a regression model on the one hot encoded data.
InĀ [63]:
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
Out[63]:
sex smoker day time
0 Female Yes Sat Dinner
1 Female No Thur Dinner
2 Male Yes Sat Dinner
3 Male No Thur Lunch
4 Male No Thur Lunch
InĀ [64]:
from sklearn.pipeline import Pipeline
InĀ [65]:
pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

Now that pl is instantiated, we fit it the same way we would fit the individual steps.

InĀ [66]:
pl.fit(tips_cat, tips['tip'])
Out[66]:
Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

Now, to make predictions using raw data, all we need to do is use pl.predict:

InĀ [67]:
pl.predict(tips_cat.iloc[:5])
Out[67]:
array([2.92, 3.16, 3.09, 2.83, 2.83])

pl performs both feature transformation and prediction with just a single call to predict!

We can access individual "steps" of a Pipeline through the named_steps attribute:

InĀ [68]:
pl.named_steps
Out[68]:
{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}
InĀ [69]:
pl.named_steps['one-hot'].transform(tips_cat).toarray()
Out[69]:
array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])
InĀ [70]:
pl.named_steps['lin-reg'].coef_
Out[70]:
array([-0.09,  0.09, -0.04,  0.04, -0.2 , -0.13,  0.14,  0.19,  0.25,
       -0.25])
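
To see which coefficient belongs to which one-hot-encoded column, we can line the coefficients up with the encoder's feature names (a small sketch):

# Pair each fitted coefficient with its one-hot-encoded feature name.
pd.Series(
    pl.named_steps['lin-reg'].coef_,
    index=pl.named_steps['one-hot'].get_feature_names_out()
)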

pl also has a score method, the same way a fit LinearRegression instance does:

InĀ [71]:
pl.score(tips_cat, tips['tip'])
Out[71]:
0.02749679020147533

More sophisticated Pipelines¶

  • In the previous example, we one hot encoded every input column. What if we want to perform different transformations on different columns?

  • Solution: Use a ColumnTransformer.

    • Instantiate a ColumnTransformer using a list of tuples, where:
      • The first element is a "name" we choose for the transformer.
      • The second element is a transformer instance (e.g. OneHotEncoder()).
      • The third element is a list of relevant column names.

Planning our first ColumnTransformer¶

InĀ [72]:
from sklearn.compose import ColumnTransformer

Let's perform different transformations on the quantitative and categorical features of tips (note that we are not transforming 'tip').

InĀ [73]:
tips_features = tips.drop('tip', axis=1)
tips_features.head()
Out[73]:
total_bill sex smoker day time size smoker == Yes smoker == No
0 3.07 Female Yes Sat Dinner 1 1 0
1 18.78 Female No Thur Dinner 2 0 1
2 26.59 Male Yes Sat Dinner 3 1 0
3 14.26 Male No Thur Lunch 2 0 1
4 21.16 Male No Thur Lunch 2 0 1
  • We will leave the 'total_bill' column untouched.

  • To the 'size' column, we will apply the Binarizer transformer with a threshold of 2 (big tables vs. small tables).

  • To the categorical columns, we will apply the OneHotEncoder transformer.

  • In essence, we will create a transformer that reproduces the following DataFrame:

size x0_Female x0_Male x1_No x1_Yes x2_Fri x2_Sat x2_Sun x2_Thur x3_Dinner x3_Lunch total_bill
0 0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 16.99
1 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 10.34
2 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 21.01
3 0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 23.68
4 1 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 24.59

Building a Pipeline using a ColumnTransformer¶

Let's start by creating our ColumnTransformer.

InĀ [74]:
preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)

Now, let's create a Pipeline using preproc as a transformer, and fit it:

InĀ [75]:
pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])
InĀ [76]:
pl.fit(tips_features, tips['tip'])
Out[76]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

Prediction is as easy as calling predict:

InĀ [77]:
tips_features.head()
Out[77]:
total_bill sex smoker day time size smoker == Yes smoker == No
0 3.07 Female Yes Sat Dinner 1 1 0
1 18.78 Female No Thur Dinner 2 0 1
2 26.59 Male Yes Sat Dinner 3 1 0
3 14.26 Male No Thur Lunch 2 0 1
4 21.16 Male No Thur Lunch 2 0 1
InĀ [78]:
# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())
Out[78]:
array([1.16, 2.81, 3.7 , 2.4 , 3.07])

Aside: FunctionTransformer¶

A transformer you'll often use as part of a ColumnTransformer is the FunctionTransformer, which enables you to use your own functions on entire columns. Think of it as the sklearn equivalent of apply.

InĀ [79]:
from sklearn.preprocessing import FunctionTransformer
InĀ [80]:
f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])
Out[80]:
array([1.  , 1.41, 1.73])
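
For instance, here's a sketch (the names log_preproc and pl_log are made up for this example) that log-scales 'total_bill' with a FunctionTransformer while one hot encoding the categorical columns:

log_preproc = ColumnTransformer(
    transformers=[
        ('log_bill', FunctionTransformer(np.log), ['total_bill']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='drop'  # Ignore all other columns in this sketch.
)

pl_log = Pipeline([
    ('preprocessor', log_preproc),
    ('lin-reg', LinearRegression())
])
pl_log.fit(tips_features, tips['tip'])
pl_log.score(tips_features, tips['tip'])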

Summary: Pipelines¶

  • Pipelines are powerful because they allow you to perform feature engineering and training/prediction all through a single object.
  • As we'll see next time, they also allow us to easily compare models against each other.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Generalization¶

Motivation¶

  • You and Billy are studying for an upcoming exam. You both decide to test your understanding by taking a practice exam.

    • Your logic: If you do well on the practice exam, you should do well on the real exam.
  • You each take the practice exam once and look at the solutions afterwards.

  • Your strategy: Memorize the answers to all practice exam questions, e.g. "Question 1: A; Question 2: C; Question 3: A."

  • Billy's strategy: Learn high-level concepts from the solutions, e.g. "data are NMAR if the likelihood of missingness depends on the missing values themselves."

  • Who will do better on the practice exam? Who will probably do better on the real exam? 🧐

Evaluating the quality of a model¶

  • So far, we've computed the RMSE (and $R^2$) of our fit regression models on the data that we used to fit them, i.e. the training data.

  • We've said that Model A is better than Model B if Model A's RMSE is lower than Model B's RMSE.

    • Remember, our training data is a sample from the data generating process.
    • Just because a model fits the training data well doesn't mean it will generalize and work well on similar, unseen samples!
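
As a preview of next time (a rough sketch, not something we've formally covered yet), sklearn's train_test_split lets us hold out some rows, fit on the rest, and compare the RMSE on seen vs. unseen rows:

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as "unseen" data (the default split size).
X_train, X_test, y_train, y_test = train_test_split(
    tips[['total_bill', 'size']], tips['tip'], random_state=1
)

holdout_model = LinearRegression()
holdout_model.fit(X_train, y_train)

# Training RMSE vs. RMSE on the held-out rows.
rmse(y_train, holdout_model.predict(X_train)), rmse(y_test, holdout_model.predict(X_test))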

Example: Overfitting and underfitting¶

Let's collect two samples $\{(x_i, y_i)\}$ from the same data generating process.

InĀ [81]:
np.random.seed(23) # For reproducibility.

def sample_dgp(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_dgp()
sample_2 = sample_dgp()

For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$ (remember, in reality, you won't get to see the DGP).

InĀ [82]:
px.scatter(sample_1, x='x', y='y', title='Sample 1')

Polynomial regression¶

Let's fit three polynomial models on Sample 1:

  • Degree 1.
  • Degree 3.
  • Degree 25.

The PolynomialFeatures transformer will be helpful here.

InĀ [83]:
from sklearn.preprocessing import PolynomialFeatures
InĀ [84]:
# fit_transform fits and transforms the same input.
# Degree 3 produces the columns [1, x, x^2, x^3].
d3 = PolynomialFeatures(3)
d3.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))
Out[84]:
array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.],
       [ 1., -2.,  4., -8.]])
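
As a sketch of the underlying idea (separate from the lec14_util plotting helper), we can chain PolynomialFeatures with LinearRegression in a Pipeline and compare training RMSEs on Sample 1:

# Fit a polynomial model of each degree on Sample 1 and compute its RMSE on Sample 1.
for deg in [1, 3, 25]:
    poly_pl = Pipeline([
        ('poly', PolynomialFeatures(deg)),
        ('lin-reg', LinearRegression())
    ])
    poly_pl.fit(sample_1[['x']], sample_1['y'])
    print(deg, rmse(sample_1['y'], poly_pl.predict(sample_1[['x']])))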

Below, we look at our three models' predictions on Sample 1 (which they were trained on).

InĀ [85]:
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
import lec14_util as util
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

The degree 25 polynomial has the lowest RMSE on Sample 1.

How do the same fit polynomials look on Sample 2?

InĀ [86]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
  • The degree 3 polynomial has the lowest RMSE on Sample 2.

  • Note that we didn't get to see Sample 2 when fitting our models!

  • As such, it seems that the degree 3 polynomial generalizes better to unseen data than the degree 25 polynomial does.

What if we fit a degree 1, degree 3, and degree 25 polynomial on Sample 2 as well?

InĀ [87]:
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

Key idea: Degree 25 polynomials seem to vary more when trained on different samples than degree 3 and 1 polynomials do.

  • Next time: We'll investigate this phenomenon in-depth, and introduce techniques for choosing the best model.