from dsc80_utils import *

# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)

tips

mean_tip = tips['tip'].mean()

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=tips[['total_bill']], y=tips['tip'])

model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])

LinearRegression()

def rmse(actual, pred):
    """Return the root mean squared error between `actual` and `pred`.

    The two arguments are subtracted element-wise, so `pred` may be either an
    array-like of predictions or a single constant prediction (as with
    `mean_tip` in this notebook).
    """
    residuals = actual - pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)

all_preds = model.predict(tips[['total_bill']])
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)

rmse_dict['two features'] = rmse(
    tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)

pd.DataFrame({'rmse': rmse_dict.values()}, index=rmse_dict.keys())

model_two.score(tips[['total_bill', 'size']], tips['tip'])

0.46786930879612565

pred = tips.assign(predicted=model_two.predict(tips[['total_bill', 'size']]))
pred

np.var(pred['predicted']) / np.var(pred['tip'])

0.4678693087961255

# R^2 as the squared correlation between predicted and actual tips.
# `pred` still carries the string columns of `tips` ('sex', 'smoker', 'day', 'time'),
# so restrict to the two numeric columns we need before calling .corr():
# pandas >= 2.0 no longer drops non-numeric columns silently and raises instead.
pred[['predicted', 'tip']].corr().loc['predicted', 'tip'] ** 2

0.46786930879612554

model_two.score(tips[['total_bill', 'size']], tips['tip'])

0.46786930879612565

1 - rmse(pred['tip'], pred['predicted']) ** 2 / np.var(pred['tip'])

0.4678693087961261

tips.head()

tips.head()

tips['smoker'].value_counts()

No     151
Yes     93
Name: smoker, dtype: int64

(tips['smoker'] == 'Yes').astype(int).head()

0    1
1    0
2    1
3    0
4    0
Name: smoker, dtype: int64

for val in tips['smoker'].unique():
    tips[f'smoker == {val}'] = (tips['smoker'] == val).astype(int)

tips.head()

model_three = LinearRegression()
model_three.fit(tips[['total_bill', 'size', 'smoker == Yes']], tips['tip'])

LinearRegression()

model_three.intercept_, model_three.coef_

(0.7090155167346053, array([ 0.09,  0.18, -0.08]))

# pio.renderers.default = 'plotly_mimetype+notebook' # If it doesn't render, try uncommenting this.

XX, YY = np.mgrid[0:50:2, 0:8:1]
Z_0 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 0
Z_1 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 1
plane_0 = go.Surface(x=XX, y=YY, z=Z_0, colorscale='Greens')
plane_1 = go.Surface(x=XX, y=YY, z=Z_1, colorscale='Purples')

fig = go.Figure(data=[plane_0, plane_1])

tips_0 = tips[tips['smoker'] == 'No']
tips_1 = tips[tips['smoker'] == 'Yes']

fig.add_trace(go.Scatter3d(x=tips_0['total_bill'], 
                           y=tips_0['size'], 
                           z=tips_0['tip'], mode='markers', marker = {'color': 'green'}))

fig.add_trace(go.Scatter3d(x=tips_1['total_bill'], 
                           y=tips_1['size'], 
                           z=tips_1['tip'], mode='markers', marker = {'color': 'purple'}))

fig.update_layout(scene = dict(
    xaxis_title='Total Bill',
    yaxis_title='Table Size',
    zaxis_title='Tip'),
  title='Tip vs. Total Bill and Table Size (Green = Non-Smoking Section, Purple = Smoking Section)',
    width=1000, height=800,
    showlegend=False)

fig = go.Figure()
fig.add_trace(go.Scatter(x=tips['total_bill'], y=tips['tip'], 
                         mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=tips['total_bill'], y=model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]), 
                         mode='markers', name='Predicted Tips using Total Bill, <br>Table Size, and Smoker Status'))

fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

rmse_dict['three features'] = rmse(tips['tip'], 
                                   model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]))
rmse_dict

{'constant tip amount': 1.3807999538298956,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.0072561271146618,
 'three features': 1.0064899786822126}

tips.head()

ordinal_enc = {
    '✩': 1,
    '✩✩': 2,
    '✩✩✩': 3,
    '✩✩✩✩': 4,
    '✩✩✩✩✩': 5,
}
ordinal_enc

{'✩': 1, '✩✩': 2, '✩✩✩': 3, '✩✩✩✩': 4, '✩✩✩✩✩': 5}

ratings = pd.DataFrame().assign(RATING=['✩', '✩✩', '✩✩✩', '✩✩', '✩✩✩', '✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩'])
ratings

ratings.replace(ordinal_enc)

mpg = sns.load_dataset('mpg').dropna()
mpg.head()

mpg['model_year'].value_counts()

73    40
78    36
76    34
      ..
71    27
80    27
74    26
Name: model_year, Length: 13, dtype: int64

px.scatter(mpg, x='horsepower', y='mpg')

car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])

LinearRegression()

hp_points = pd.DataFrame({'horsepower': [25, 225]})
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(go.Scatter(
    x=hp_points['horsepower'],
    y=car_model.predict(hp_points),
    mode='lines',
    name='Predicted MPG using Horsepower'
))

res = mpg.assign(
    Predictions=car_model.predict(mpg[['horsepower']]),
    Residuals=mpg['mpg'] - car_model.predict(mpg[['horsepower']]),
)
fig = px.scatter(res, x='Predictions', y='Residuals')
fig.add_hline(0, line_width=3, opacity=1)

car_model.score(mpg[['horsepower']], mpg['mpg'])

0.6059482578894348

mpg['log hp'] = np.log(mpg['horsepower'])

px.scatter(mpg, x='log hp', y='mpg')

car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])

LinearRegression()

# Two endpoints (in log-horsepower units) are enough to draw the fitted line.
log_hp_points = pd.DataFrame({'log hp': [3.7, 5.5]})
# Scatter of the transformed data, with the fitted line overlaid as a second trace.
fig = px.scatter(mpg, x='log hp', y='mpg')
fig.add_trace(go.Scatter(
    x=log_hp_points['log hp'],
    y=car_model_log.predict(log_hp_points),
    mode='lines',
    name='Predicted MPG using log(Horsepower)'
))

car_model_log.score(mpg[['log hp']], mpg['mpg'])

0.6683347641192137

fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(
    go.Scatter(
        x=mpg['horsepower'], 
        y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),  
        mode='markers', name='Predicted MPG using log(Horsepower)'
    )
)
fig

car_model_log.intercept_, car_model_log.coef_

(108.69970699574486, array([-18.58]))

tips.head()

from sklearn.preprocessing import Binarizer

tips['total_bill'].head()

0     3.07
1    18.78
2    26.59
3    14.26
4    21.16
Name: total_bill, dtype: float64

bi = Binarizer(threshold=20)

transformed_bills = bi.transform(tips[['total_bill']]) # Must give transform a 2D array/DataFrame.
transformed_bills[:5]

/Users/sam/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:443: UserWarning:

X has feature names, but Binarizer was fitted without feature names

array([[0.],
       [0.],
       [1.],
       [0.],
       [1.]])

tips_quant = tips[['total_bill', 'size']]
tips_quant.head()

from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler()

stdscaler.transform(tips_quant)

---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In[50], line 1
----> 1 stdscaler.transform(tips_quant)

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:970, in StandardScaler.transform(self, X, copy)
    955 def transform(self, X, copy=None):
    956     """Perform standardization by centering and scaling.
    957 
    958     Parameters
   (...)
    968         Transformed array.
    969     """
--> 970     check_is_fitted(self)
    972     copy = copy if copy is not None else self.copy
    973     X = self._validate_data(
    974         X,
    975         reset=False,
   (...)
    980         force_all_finite="allow-nan",
    981     )

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/validation.py:1222, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1217     fitted = [
   1218         v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
   1219     ]
   1221 if not fitted:
-> 1222     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# This is like saying "determine the mean and SD of each column in tips_quant".
stdscaler.fit(tips_quant)

StandardScaler()

# First column is 'total_bill', second column is 'size'.
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]

array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       [-0.62, -0.6 ],
       [ 0.15, -0.6 ]])

stdscaler.mean_

array([19.79,  2.57])

stdscaler.var_

array([78.93,  0.9 ])

stdscaler.transform(tips_quant.sample(5))

array([[ 0.97, -0.6 ],
       [-0.28, -0.6 ],
       [ 1.19,  1.51],
       [-0.49, -0.6 ],
       [ 0.65,  1.51]])

stdscaler.fit_transform(tips_quant)

array([[-1.88, -1.65],
       [-0.11, -0.6 ],
       [ 0.77,  0.45],
       ...,
       [-0.26, -0.6 ],
       [-1.09, -0.6 ],
       [-0.32,  0.45]])

tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
ohe.fit(tips_cat)

OneHotEncoder()

ohe.transform(tips_cat)

<244x10 sparse matrix of type '<class 'numpy.float64'>'
	with 976 stored elements in Compressed Sparse Row format>

ohe.transform(tips_cat).toarray()

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

ohe.get_feature_names_out() # Each name is '<column name in tips_cat>_<category>', e.g. 'sex_Female'.

array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)

pd.DataFrame(ohe.transform(tips_cat).toarray(), 
             columns=ohe.get_feature_names_out()) # If we need a DataFrame back, for some reason.

tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()

from sklearn.pipeline import Pipeline

pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

pl.fit(tips_cat, tips['tip'])

Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

pl.predict(tips_cat.iloc[:5])

array([2.92, 3.16, 3.09, 2.83, 2.83])

pl.named_steps

{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}

pl.named_steps['one-hot'].transform(tips_cat).toarray()

array([[1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.]])

pl.named_steps['one-hot'].get_feature_names_out()

array(['sex_Female', 'sex_Male', 'smoker_No', 'smoker_Yes', 'day_Fri',
       'day_Sat', 'day_Sun', 'day_Thur', 'time_Dinner', 'time_Lunch'],
      dtype=object)

pl.named_steps['lin-reg'].coef_

array([-0.09,  0.09, -0.04,  0.04, -0.2 , -0.13,  0.14,  0.19,  0.25,
       -0.25])

# Why is this so low?
pl.score(tips_cat, tips['tip'])

0.02749679020147533

from sklearn.compose import ColumnTransformer

# `tips` was given the hand-made indicator columns 'smoker == Yes' and 'smoker == No'
# earlier in this notebook (the loop over tips['smoker'].unique()). Drop them together
# with the target, so the features are just the original columns and the only column
# left over for the ColumnTransformer's `remainder` is 'total_bill'.
# errors='ignore' keeps this cell working if those indicator columns were never created.
tips_features = tips.drop(columns=['tip', 'smoker == Yes', 'smoker == No'], errors='ignore')
tips_features.head()

preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)

pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])

pl.fit(tips_features, tips['tip'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

tips_features.head()

# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())

array([1.16, 2.81, 3.7 , 2.4 , 3.07])

from sklearn.preprocessing import FunctionTransformer

f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])

array([1.  , 1.41, 1.73])

	mpg	cylinders	displacement	horsepower	...	acceleration	model_year	origin	name
0	18.0	8	307.0	130.0	...	12.0	70	usa	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	...	11.5	70	usa	buick skylark 320
2	18.0	8	318.0	150.0	...	11.0	70	usa	plymouth satellite
3	16.0	8	304.0	150.0	...	12.0	70	usa	amc rebel sst
4	17.0	8	302.0	140.0	...	10.5	70	usa	ford torino

Property	Example	Description
Initialize with parameters	`binar = Binarizer(thresh)`	set x=1 if x > thresh, else 0
Transform data in a dataset	`feat = binar.transform(data)`	Binarize all columns in `data`

Property	Example	Description
Initialize with parameters	`stdscaler = StandardScaler()`	z-score the data (no parameters)
Fit the transformer	`stdscaler.fit(X)`	Compute the mean and SD of `X`
Transform data in a dataset	`feat = stdscaler.transform(X_new)`	z-score `X_new` with mean and SD of `X`
Fit and transform	`stdscaler.fit_transform(X)`	Compute the mean and SD of `X`, then z-score `X`

	total_bill	tip	sex	smoker	day	time	size
0	3.07	1.00	Female	Yes	Sat	Dinner	1
1	18.78	3.00	Female	No	Thur	Dinner	2
2	26.59	3.41	Male	Yes	Sat	Dinner	3
...	...	...	...	...	...	...	...
241	17.47	3.50	Female	No	Thur	Lunch	2
242	10.07	1.25	Male	No	Sat	Dinner	2
243	16.93	3.07	Female	No	Sat	Dinner	3

	rmse
constant tip amount	1.38
one feature: total bill	1.02
two features	1.01

UID	AGE	STATE	HAS_BOUGHT	REVIEW	\
74	32	NY	True	"Meh."	\|	✩✩
42	50	WA	True	"Worked out of the box..."	\|	✩✩✩✩
57	16	CA	NULL	"Sick af..."	\|	✩
...	...	...	...	...	\|	...
(int)	(int)	(str)	(bool)	(str)	\|	(str)

	RATING
0	✩
1	✩✩
2	✩✩✩
...	...
6	✩✩✩
7	✩✩✩✩
8	✩✩✩✩✩

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0

	size	x0_Female	x0_Male	x1_No	x2_Sun	x3_Dinner	total_bill
0	0	1.0	0.0	1.0	1.0	1.0	16.99
1	1	0.0	1.0	1.0	1.0	1.0	10.34
2	1	0.0	1.0	1.0	1.0	1.0	21.01
3	0	0.0	1.0	1.0	1.0	1.0	23.68
4	1	1.0	0.0	1.0	1.0	1.0	24.59

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0

Lecture 14 – Feature Engineering¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Review: Predicting tips 🧑‍🍳¶

Linear models¶

Root mean squared error¶

The .score method of a LinearRegression object¶

Aside: $R^2$¶

Calculating $R^2$¶

Relationship between $R^2$ and RMSE¶

What's next?¶

Feature engineering ⚙️¶

The goal of feature engineering¶

One hot encoding¶

Example: One hot encoding 'smoker'¶

Model #4: Multiple linear regression using total bill, table size, and smoker status¶

Visualizing Model #4¶

Comparing Model #4 to earlier models¶

Reflection¶

Question 🤔 (Answer at q.dsc80.com)

Example: Predicting ratings ⭐️¶

Example: Predicting ratings ⭐️¶

Uninformative features¶

Dropping features¶

Encoding ordinal features¶

Encoding nominal features¶

Example: Horsepower 🚗¶

The relationship between 'horsepower' and 'mpg'¶

Predicting 'mpg' using 'horsepower'¶

Linearization¶

Predicting 'mpg' using log('horsepower')¶

Quantitative scaling¶

Question 🤔 (Answer at q.dsc80.com)

The modeling process¶

The modeling process¶

preprocessing and linear_models¶

Transformers in sklearn¶

Transformer classes¶

Example: Predicting tips 🧑‍🍳¶

Example transformer: Binarizer¶

Example transformer: StandardScaler¶

Example transformer: StandardScaler¶

💡 Pro-Tip: Using .fit_transform¶

StandardScaler summary¶

Example transformer: OneHotEncoder¶

Question 🤔 (Answer at q.dsc80.com)

We ended lecture here.

Pipelines¶

Pipelines in sklearn¶

Our first Pipeline¶

More sophisticated Pipelines¶

Planning our first ColumnTransformer¶

Building a Pipeline using a ColumnTransformer¶

Aside: FunctionTransformer¶

Summary, next time¶

Summary¶

Next time¶

The `.score` method of a `LinearRegression` object¶

Example: One hot encoding `'smoker'`¶

The relationship between `'horsepower'` and `'mpg'`¶

Predicting `'mpg'` using `'horsepower'`¶

Predicting `'mpg'` using `log('horsepower')`¶

`preprocessing` and `linear_model`s¶

Transformers in `sklearn`¶

Example transformer: `Binarizer`¶

Example transformer: `StandardScaler`¶

Example transformer: `StandardScaler`¶

💡 Pro-Tip: Using `.fit_transform`¶

`StandardScaler` summary¶

Example transformer: `OneHotEncoder`¶

`Pipeline`s in `sklearn`¶

Our first `Pipeline`¶

More sophisticated `Pipeline`s¶

Planning our first `ColumnTransformer`¶

Building a `Pipeline` using a `ColumnTransformer`¶

Aside: `FunctionTransformer`¶

	sex_Female	sex_Male	smoker_No	smoker_Yes	...	day_Sun	day_Thur	time_Dinner	time_Lunch
0	1.0	0.0	0.0	1.0	...	0.0	0.0	1.0	0.0
1	1.0	0.0	1.0	0.0	...	0.0	1.0	1.0	0.0
2	0.0	1.0	0.0	1.0	...	0.0	0.0	1.0	0.0
...	...	...	...	...	...	...	...	...	...
241	1.0	0.0	1.0	0.0	...	0.0	1.0	0.0	1.0
242	0.0	1.0	1.0	0.0	...	0.0	0.0	1.0	0.0
243	1.0	0.0	1.0	0.0	...	0.0	0.0	1.0	0.0