In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'

import seaborn as sns

from sklearn.linear_model import LinearRegression

Lecture 21 – Feature Engineering¶

DSC 80, Winter 2023¶

📣 Announcements¶

  • Project 4 is due on Thursday, March 9th at 11:59PM.
  • Lab 8 (modeling) is due on Monday, March 6th at 11:59PM.
  • RSVP to the Senior Capstone Showcase on March 15th at hdsishowcase.com.
    • There is no live lecture for DSC 80 on the day of the showcase.

Agenda¶

  • Case study: Restaurant tips 🧑‍🍳.
    • Other methods for evaluating regression models.
  • Feature engineering.
    • One hot encoding.
    • Encoding categorical features, both nominal and ordinal.
    • Quantitative scaling.

Case study: Restaurant tips 🧑‍🍳¶

In [2]:
# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)
tips.head()
Out[2]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2

Model #1: Constant¶

Let's suppose we choose squared loss, in which case the best constant prediction is $h^* = \text{mean}(y)$.

In [3]:
mean_tip = tips['tip'].mean()
mean_tip
Out[3]:
2.99827868852459
In [4]:
# Unfortunately, the code to visualize a scatter plot and a line
# in plotly is not all that concise.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=tips['total_bill'], 
    y=tips['tip'], 
    mode='markers',
    name='Original Data')
)

fig.add_trace(go.Scatter(
    x=[0, 60],
    y=[mean_tip, mean_tip],
    mode='lines',
    name='Constant Prediction (Mean)'
))

fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip',
                  template=TEMPLATE)
fig.update_xaxes(range=[0, 60])

Let's compute the RMSE of our constant model's predictions, and store it in a dictionary that we can refer to later on.

In [5]:
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))
In [6]:
rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)
rmse_dict
Out[6]:
{'constant tip amount': 1.3807999538298952}
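
As a quick sanity check – a sketch, not part of the original analysis – no other constant prediction should beat the mean under squared loss:

# Hypothetical alternative constants; the mean should achieve the lowest RMSE.
for h in [2.5, mean_tip, 3.5]:
    print(round(h, 3), rmse(tips['tip'], h))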

Model #2: Simple linear regression using total bill¶

We can fit a simple linear model to predict tips as a function of total bill:

$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill}$$
In [7]:
model = LinearRegression()
model.fit(X=tips[['total_bill']], y=tips['tip'])
Out[7]:
LinearRegression()
In [8]:
fig.add_trace(go.Scatter(
    x=[0, 60],
    y=model.predict([[0], [60]]),
    mode='lines',
    name='Linear: Total Bill Only'
))
In [9]:
all_preds = model.predict(tips[['total_bill']])
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)
rmse_dict
Out[9]:
{'constant tip amount': 1.3807999538298952,
 'one feature: total bill': 1.0178504025697377}

The RMSE of our simple linear model is lower than that of our constant model, meaning it does a better job of modeling the training data.

Model #3: Multiple linear regression using total bill and table size¶

Let's try using another feature – table size. Such a model would predict tips using:

$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill} + w_2 \cdot \text{table size}$$
In [10]:
model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
Out[10]:
LinearRegression()
In [11]:
model_two.predict([[25, 4]])
Out[11]:
array([3.75716934])

What does this model look like?

Plane of best fit ✈️¶

Here, we must draw a 3D scatter plot and plane, with one axis for total bill, one axis for table size, and one axis for tip. The code below does this.

In [12]:
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z = model_two.intercept_ + model_two.coef_[0] * XX + model_two.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')

fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=tips['total_bill'], 
                           y=tips['size'], 
                           z=tips['tip'], mode='markers', marker = {'color': '#656DF1'}))

fig.update_layout(scene = dict(
    xaxis_title='Total Bill',
    yaxis_title='Table Size',
    zaxis_title='Tip'),
  title='Tip vs. Total Bill and Table Size',
    width=1000, height=800,
    template=TEMPLATE)

Comparing models, again¶

How does our two-feature linear model stack up to our single feature linear model and our constant model?

In [13]:
rmse_dict['two features'] = rmse(
    tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)
In [14]:
rmse_dict
Out[14]:
{'constant tip amount': 1.3807999538298952,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.007256127114662}
  • The RMSE of our two-feature model is the lowest of the three models we've looked at so far, but only barely – adding table size to our linear model didn't gain us much.
  • It's also not clear whether table sizes are practically useful in predicting tips.

The .score method of a LinearRegression object¶

Model objects in sklearn that have already been fit have a score method.

In [15]:
model_two.score(tips[['total_bill', 'size']], tips['tip'])
Out[15]:
0.46786930879612565

That doesn't look like the RMSE... what is it? 🤔

Aside: $R^2$¶

  • $R^2$, or the coefficient of determination, is a measure of the quality of a linear fit.
  • There are a few equivalent ways of computing it, assuming your model is linear and has an intercept term:
$$R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$$

$$R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$$
  • Interpretation: $R^2$ is the proportion of variance in $y$ that the linear model explains.
  • In the simple linear regression case, it is the square of the correlation coefficient, $r$.
  • Key idea: $R^2$ ranges from 0 to 1. The closer it is to 1, the better the linear fit is.
    • $R^2$ has no units of measurement, unlike RMSE.

Calculating $R^2$¶

Below, we compute all_preds, which contains model_two's predicted 'tip' for every row in tips.

In [16]:
tips.head()
Out[16]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
In [17]:
all_preds = model_two.predict(tips[['total_bill', 'size']])
all_preds[:5]
Out[17]:
array([1.14617248, 2.7952968 , 3.71198575, 2.37623251, 3.01595454])

Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$

In [18]:
np.var(all_preds) / np.var(tips['tip'])
Out[18]:
0.46786930879612576

Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$

Note: By correlation here, we are referring to $r$, the same correlation coefficient you saw in DSC 10.

In [19]:
(np.corrcoef(all_preds, tips['tip'])) ** 2
Out[19]:
array([[1.        , 0.46786931],
       [0.46786931, 1.        ]])

Method 3: LinearRegression.score

In [20]:
model_two.score(tips[['total_bill', 'size']], tips['tip'])
Out[20]:
0.46786930879612565

All three methods provide the same result!

LinearRegression summary¶

Property                       Example                     Description
Initialize model parameters    lr = LinearRegression()     Create (empty) linear regression model
Fit the model to the data      lr.fit(X, y)                Determines regression coefficients
Use model for prediction       lr.predict(X_new)           Uses regression line to make predictions
Evaluate the model             lr.score(X, y)              Calculates the $R^2$ of the LR model
Access model attributes        lr.coef_, lr.intercept_     Accesses the regression coefficients and intercept
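
Putting the table together, here's a minimal end-to-end sketch using the tips data from above:

lr = LinearRegression()                                   # initialize an (empty) model
lr.fit(tips[['total_bill']], tips['tip'])                 # fit: determine the optimal intercept and coefficients
preds = lr.predict(tips[['total_bill']])                  # predict tips for a feature matrix
r_squared = lr.score(tips[['total_bill']], tips['tip'])   # evaluate: the R^2 of the fit
lr.intercept_, lr.coef_                                   # access the fitted parameters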

What's next?¶

In [21]:
tips.head()
Out[21]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
  • So far, in our journey to predict 'tip', we've only used the existing numerical features in our dataset, 'total_bill' and 'size'.
  • There's a lot of information in tips that we didn't use – 'sex', 'smoker', 'day', and 'time', for example. We can't use these features in their current form, because they're non-numeric.
  • How do we use categorical features in a regression model?

Feature engineering ⚙️¶

The goal of feature engineering¶

  • Feature engineering is the act of finding transformations that turn raw data into effective quantitative variables.
  • A feature function $\phi$ (phi, pronounced "fea") is a mapping from raw data to $d$-dimensional space, i.e. $\phi: \text{raw data} \rightarrow \mathbb{R}^d$.
    • If two observations $x_i$ and $x_j$ are "similar" in the raw data space, then $\phi(x_i)$ and $\phi(x_j)$ should also be "similar."
  • A "good" choice of features depends on many factors:
    • The kind of data (quantitative, ordinal, nominal).
    • The relationship(s) and association(s) being modeled.
    • The model type (e.g. linear models, decision tree models, neural networks).

One hot encoding¶

  • One hot encoding is a transformation that turns a categorical feature into several binary features.
  • Suppose column 'col' has $N$ unique values, $A_1$, $A_2$, ..., $A_N$. For each unique value $A_i$, we define the following feature function:
$$\phi_i(x) = \left\{\begin{array}{ll}1 & {\rm if\ } x = A_i \\ 0 & {\rm if\ } x\neq A_i \\ \end{array}\right. $$
  • Note that 1 means "yes" and 0 means "no".
  • One hot encoding is also called "dummy encoding", and $\phi(x)$ may also be referred to as an "indicator variable".

Example: One hot encoding 'smoker'¶

For each unique value of 'smoker' in our dataset, we create a separate binary column for just that value. (Remember, 'smoker' is 'Yes' when the table was in the smoking section of the restaurant and 'No' otherwise.)

In [22]:
tips.head()
Out[22]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
In [23]:
tips['smoker'].value_counts()
Out[23]:
No     151
Yes     93
Name: smoker, dtype: int64
In [24]:
(tips['smoker'] == 'Yes').astype(int).head()
Out[24]:
0    1
1    0
2    1
3    0
4    0
Name: smoker, dtype: int64
In [25]:
for val in tips['smoker'].unique():
    tips[f'smoker == {val}'] = (tips['smoker'] == val).astype(int)
In [26]:
tips.head()
Out[26]:
total_bill tip sex smoker day time size smoker == Yes smoker == No
0 3.07 1.00 Female Yes Sat Dinner 1 1 0
1 18.78 3.00 Female No Thur Dinner 2 0 1
2 26.59 3.41 Male Yes Sat Dinner 3 1 0
3 14.26 2.50 Male No Thur Lunch 2 0 1
4 21.16 3.00 Male No Thur Lunch 2 0 1

Model #4: Multiple linear regression using total bill, table size, and smoker status¶

Now that we've converted 'smoker' to a numerical variable, we can use it as input in a regression model. Here's the model we'll try to fit:

$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill} + w_2 \cdot \text{table size} + w_3 \cdot \text{smoker == Yes}$$

Subtlety: There's no need to use both 'smoker == No' and 'smoker == Yes'. If we know the value of one, we already know the value of the other. We can use either one.

In [27]:
model_three = LinearRegression()
model_three.fit(tips[['total_bill', 'size', 'smoker == Yes']], tips['tip'])
Out[27]:
LinearRegression()

The following cell gives us our $w^*$s:

In [28]:
model_three.intercept_, model_three.coef_
Out[28]:
(0.7090155167346053, array([ 0.09388839,  0.18033156, -0.08343255]))

Thus, our trained linear model to predict tips given total bills, table sizes, and smoker status (yes or no) is:

$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$

Visualizing Model #4¶

Our new fit model is:

$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$

To visualize our data and linear model, we'd need 4 dimensions:

  • One for total bill
  • One for table size
  • One for 'smoker == Yes'.
  • One for tip.

Humans can't visualize in 4D, but there's a workaround: since 'smoker == Yes' has only two possible values, 1 or 0, we can look at those two cases separately.

Case 1: 'smoker == Yes' is 1, meaning that the table was in the smoking section.

$$\begin{align*} \text{predicted tip} &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot 1 \\ &= 0.626 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} \end{align*}$$

Case 2: 'smoker == Yes' is 0, meaning that the table was not in the smoking section.

$$\begin{align*} \text{predicted tip} &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot 0 \\ &= 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} \end{align*}$$

Key idea: These are two parallel planes in 3D, with different $z$-intercepts!

Note that the two planes are very close to one another – you'll have to zoom in to see the difference.

In [29]:
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z_0 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 0
Z_1 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 1
plane_0 = go.Surface(x=XX, y=YY, z=Z_0, colorscale='Greens')
plane_1 = go.Surface(x=XX, y=YY, z=Z_1, colorscale='Purples')

fig = go.Figure(data=[plane_0, plane_1])

tips_0 = tips[tips['smoker'] == 'No']
tips_1 = tips[tips['smoker'] == 'Yes']

fig.add_trace(go.Scatter3d(x=tips_0['total_bill'], 
                           y=tips_0['size'], 
                           z=tips_0['tip'], mode='markers', marker = {'color': 'green'}))

fig.add_trace(go.Scatter3d(x=tips_1['total_bill'], 
                           y=tips_1['size'], 
                           z=tips_1['tip'], mode='markers', marker = {'color': 'purple'}))

fig.update_layout(scene = dict(
    xaxis_title='Total Bill',
    yaxis_title='Table Size',
    zaxis_title='Tip'),
  title='Tip vs. Total Bill and Table Size (Green = Non-Smoking Section, Purple = Smoking Section)',
    width=1000, height=800,
    showlegend=False,
    template=TEMPLATE)

If we want to visualize in 2D, we need to pick a single feature to place on the $x$-axis.

In [30]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=tips['total_bill'], y=tips['tip'], 
                         mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=tips['total_bill'], y=model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]), 
                         mode='markers', name='Predicted Tips using Total Bill, Table Size, and Smoker Status'))

fig.update_layout(showlegend=True, template=TEMPLATE, title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

This is a linear model, so why don't its predictions look like a straight line?

Comparing Model #4 to earlier models¶

In [31]:
rmse_dict['three features'] = rmse(tips['tip'], 
                                   model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]))
rmse_dict
Out[31]:
{'constant tip amount': 1.3807999538298952,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.007256127114662,
 'three features': 1.0064899786822128}

Adding 'smoker == Yes' decreased the training RMSE of our model, but barely.

Reflection¶

In [32]:
tips.head()
Out[32]:
total_bill tip sex smoker day time size smoker == Yes smoker == No
0 3.07 1.00 Female Yes Sat Dinner 1 1 0
1 18.78 3.00 Female No Thur Dinner 2 0 1
2 26.59 3.41 Male Yes Sat Dinner 3 1 0
3 14.26 2.50 Male No Thur Lunch 2 0 1
4 21.16 3.00 Male No Thur Lunch 2 0 1
  • We've one hot encoded 'smoker', but it required a for-loop.
  • Is there an easy way to one hot encode all four categorical columns – 'sex', 'smoker', 'day', and 'time' – at once, without a for-loop?
  • Yes, using sklearn.preprocessing's OneHotEncoder. More on this in the next lecture!
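
In the meantime, pandas' pd.get_dummies gets us most of the way there in one line. A sketch of the pandas alternative (not the OneHotEncoder approach previewed above):

# One hot encode all four categorical columns at once.
pd.get_dummies(tips, columns=['sex', 'smoker', 'day', 'time']).head()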

Example: Predicting ratings ⭐️¶

UID    AGE    STATE    HAS_BOUGHT    REVIEW                        RATING
74     32     NY       True          "Meh."                        ✩✩
42     50     WA       True          "Worked out of the box..."    ✩✩✩✩
57     16     CA       NULL          "Hella tots lit yo..."        ✩
...    ...    ...      ...           ...                           ...
(int)  (int)  (str)    (bool)        (str)                         (str)
  • We want to build a multiple regression model that predicts 'RATING' using the above features.
  • Why can't we build a model right away? What must we do first?
  • Some issues: Missing values, emojis and strings instead of numbers, unrelated columns.

Uninformative features¶

  • 'UID' was likely used to join the user information (e.g., 'AGE' and 'STATE') with some reviews dataset.
  • Even though 'UID's are stored as numbers, the numerical value of a user's 'UID' won't help us predict their 'RATING'.
  • If we include the 'UID' feature, our model will find whatever patterns it can between 'UID's and 'RATING's in the training (observed) data.
    • This will lead to a lower training RMSE.
  • However, since there is truly no relationship between 'UID' and 'RATING', this will lead to worse model performance on unseen data (bad).
  • Transformation: drop 'UID'.
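
In pandas, this transformation is a single method call – a sketch, assuming the table above is stored in a hypothetical DataFrame named reviews:

reviews = reviews.drop(columns=['UID'])  # reviews is hypothetical; drops the uninformative column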

Dropping features¶

There are certain scenarios where manually dropping features might be helpful:

  1. When the features do not contain information associated with the prediction task.
  2. When the feature is not available at prediction time.
  • The goal of building a model to predict 'RATING's is so that we can predict 'RATING's for users who haven't actually made a 'RATING' yet.
  • As such, our model should only depend on features that we would know before the user makes their 'RATING'.
  • For instance, if users only enter 'REVIEW's after entering 'RATING's, we shouldn't use 'REVIEW's as a feature.

Encoding ordinal features¶

UID    AGE    STATE    HAS_BOUGHT    REVIEW                        RATING
74     32     NY       True          "Meh."                        ✩✩
42     50     WA       True          "Worked out of the box..."    ✩✩✩✩
57     16     CA       NULL          "Hella tots lit yo..."        ✩
...    ...    ...      ...           ...                           ...
(int)  (int)  (str)    (bool)        (str)                         (str)

How do we encode the 'RATING' column, an ordinal variable, as a quantitative variable?

  • Transformation: Replace "number of ✩" with "number".
    • This is an ordinal encoding, a transformation that maps ordinal values to integers in a way that preserves order.
    • Example: (freshman, sophomore, junior, senior) -> (0, 1, 2, 3).
    • Important: This transformation preserves "distances" between ratings.
In [33]:
order_values = ['✩', '✩✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩']
ordinal_enc = {value: i + 1 for i, value in enumerate(order_values)}
ordinal_enc
Out[33]:
{'✩': 1, '✩✩': 2, '✩✩✩': 3, '✩✩✩✩': 4, '✩✩✩✩✩': 5}
In [34]:
ratings = pd.DataFrame().assign(RATING=['✩', '✩✩', '✩✩✩', '✩✩', '✩✩✩', '✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩'])
ratings
Out[34]:
RATING
0 ✩
1 ✩✩
2 ✩✩✩
3 ✩✩
4 ✩✩✩
5 ✩
6 ✩✩✩
7 ✩✩✩✩
8 ✩✩✩✩✩
In [35]:
ratings.replace(ordinal_enc)
Out[35]:
RATING
0 1
1 2
2 3
3 2
4 3
5 1
6 3
7 4
8 5

Encoding nominal features¶

UID    AGE    STATE    HAS_BOUGHT    REVIEW                        RATING
74     32     NY       True          "Meh."                        ✩✩
42     50     WA       True          "Worked out of the box..."    ✩✩✩✩
57     16     CA       NULL          "Hella tots lit yo..."        ✩
...    ...    ...      ...           ...                           ...
(int)  (int)  (str)    (bool)        (str)                         (str)

How do we encode the 'STATE' column, a nominal variable, as a quantitative variable? In other words, how do we turn 'STATE's into meaningful numbers?

  • Question: Why can't we use an ordinal encoding, e.g. NY -> 0, WA -> 1?
  • Answer: There is no inherent ordering to states, e.g. WA is not inherently "more" of anything than NY.
  • We've already seen the correct strategy: one hot encoding.
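
As a sketch with a hypothetical 'STATE' column, pd.get_dummies carries out exactly this one hot encoding:

states = pd.DataFrame({'STATE': ['NY', 'WA', 'CA', 'NY']})  # hypothetical data
pd.get_dummies(states['STATE'])  # one binary column per unique state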

Example: Horsepower 🚗¶

The following dataset, built into the seaborn plotting library, contains various information about (older) cars.

In [36]:
mpg = sns.load_dataset('mpg').dropna()
mpg.head()
Out[36]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino

We really do mean old:

In [37]:
mpg['model_year'].value_counts()
Out[37]:
73    40
78    36
76    34
75    30
82    30
70    29
79    29
72    28
77    28
81    28
71    27
80    27
74    26
Name: model_year, dtype: int64

Let's investigate the relationship between 'horsepower' and 'mpg'.

The relationship between 'horsepower' and 'mpg'¶

In [38]:
# Note: To create a simple scatter plot, all you need is
# px.scatter(mpg, x='horsepower', y='mpg').
# We've used the more complicated go.Scatter approach here so that we can add
# other lines on top.

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=mpg['horsepower'], 
    y=mpg['mpg'], 
    mode='markers',
    name='Original Data')
)

fig.update_layout(showlegend=True, title='MPG vs. Horsepower',
                  xaxis_title='Horsepower', yaxis_title='MPG',
                  template=TEMPLATE)
  • It appears that there is a negative association between 'horsepower' and 'mpg', though it's not quite linear.
  • Let's try to fit a simple linear model that uses 'horsepower' to predict 'mpg' and see what happens.

Predicting 'mpg' using 'horsepower'¶

In [39]:
car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])
Out[39]:
LinearRegression()

What do our predictions look like?

In [40]:
fig.add_trace(go.Scatter(
    x=[25, 225],
    y=car_model.predict([[25], [225]]),
    mode='lines',
    name='Predicted MPG using Horsepower'
))

Our regression line doesn't quite capture the curvature in the relationship between 'horsepower' and 'mpg'.

Let's compute the $R^2$ of car_model on our training data, for reference:

In [41]:
car_model.score(mpg[['horsepower']], mpg['mpg'])
Out[41]:
0.6059482578894348

Transformations¶

The Tukey-Mosteller Bulge Diagram helps us pick which transformations to apply to data in order to linearize it.

The bottom-left quadrant appears to match the shape of the scatter plot between 'horsepower' and 'mpg' the best – let's try taking the log of 'horsepower' ($X$).

In [42]:
mpg['log hp'] = np.log(mpg['horsepower'])

What does our data look like now?

In [43]:
log_fig = go.Figure()

log_fig.add_trace(go.Scatter(
    x=mpg['log hp'], 
    y=mpg['mpg'], 
    mode='markers',
    name='Original Data')
)

log_fig.update_layout(showlegend=True, title='MPG vs. log(Horsepower)',
                  xaxis_title='log(Horsepower)', yaxis_title='MPG',
                  template=TEMPLATE)

Predicting 'mpg' using log('horsepower')¶

Let's fit another linear model.

In [44]:
car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])
Out[44]:
LinearRegression()

What do our predictions look like now?

In [45]:
log_fig.add_trace(go.Scatter(
    x=[3.7, 5.5],
    y=car_model_log.predict([[3.7], [5.5]]),
    mode='lines',
    name='Predicted MPG using log(Horsepower)'
))

The fit looks a bit better! How about the $R^2$?

In [46]:
car_model_log.score(mpg[['log hp']], mpg['mpg'])
Out[46]:
0.6683347641192137

Also a bit better!

What do our predictions look like on the original, non-transformed scatter plot? Let's see:

In [47]:
fig.add_trace(
    go.Scatter(
        x=mpg['horsepower'], 
        y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),  
        mode='markers', name='Predicted MPG using log(Horsepower)', marker_color='red'
    )
)

Our predictions that used $\log(\text{Horsepower})$ as an input don't fall on a straight line. We shouldn't expect them to; the red dots come from:

$$\text{Predicted MPG} = 108.698 - 18.582 \cdot \log(\text{Horsepower})$$
In [48]:
car_model_log.intercept_, car_model_log.coef_
Out[48]:
(108.69970699574486, array([-18.58218476]))

Quantitative scaling¶

The feature transformations we've discussed so far have converted categorical variables into quantitative variables. Our log transformation, however, turned a quantitative variable into a new quantitative variable; this practice is called quantitative scaling. Common examples include the following, sketched in code after the list:

  • Standardization: $x_i \rightarrow \frac{x_i - \bar{x}}{\sigma_x}$.
  • Linearization via a non-linear transformation: e.g. $\text{log}$ and $\text{sqrt}$. See Lab 8 for more.
  • Discretization: Convert data into percentiles (or more generally, quantiles).
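
All three are sketched below on the 'horsepower' column loaded earlier (the choice of four quantiles is arbitrary, purely for illustration):

hp = mpg['horsepower']
standardized = (hp - hp.mean()) / hp.std()    # standardization (z-scores)
log_hp = np.log(hp)                           # linearization via a log transformation
quartiles = pd.qcut(hp, 4, labels=False)      # discretization into quartiles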

Summary, next time¶

Summary¶

  • The LinearRegression class in sklearn.linear_model provides an implementation of least squares linear regression that works with multiple features.
  • To transform a categorical nominal variable into a quantitative variable, use one hot encoding.
  • To transform a categorical ordinal variable into a quantitative variable, use an ordinal encoding.
  • Quantitative feature transformations allow us to use linear models to model non-linear data.

Next time¶

  • Performing one hot encoding and other feature engineering steps in sklearn directly.
  • Using sklearn Pipelines to engineer features and fit models all through a single object.