import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'
import seaborn as sns
from sklearn.linear_model import LinearRegression
# The dataset is built into plotly (and seaborn)!
# We shuffle here so that the head of the DataFrame contains rows where smoker is Yes and smoker is No,
# purely for illustration purposes (it doesn't change any of the math).
np.random.seed(1)
tips = px.data.tips().sample(frac=1).reset_index(drop=True)
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
Let's suppose we choose squared loss, meaning that $h^* = \text{mean}(y)$.
mean_tip = tips['tip'].mean()
mean_tip
2.99827868852459
# Unfortunately, the code to visualize a scatter plot and a line
# in plotly is not all that concise.
fig = go.Figure()
fig.add_trace(go.Scatter(
x=tips['total_bill'],
y=tips['tip'],
mode='markers',
name='Original Data')
)
fig.add_trace(go.Scatter(
x=[0, 60],
y=[mean_tip, mean_tip],
mode='lines',
name='Constant Prediction (Mean)'
))
fig.update_layout(showlegend=True, title='Tip vs. Total Bill',
xaxis_title='Total Bill', yaxis_title='Tip',
template=TEMPLATE)
fig.update_xaxes(range=[0, 60])
Let's compute the RMSE of our constant tip's predictions, and store it in a dictionary that we can refer to later on.
def rmse(actual, pred):
return np.sqrt(np.mean((actual - pred) ** 2))
rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)
rmse_dict
{'constant tip amount': 1.3807999538298952}
We can fit a simple linear model to predict tips as a function of total bill:
$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill}$$model = LinearRegression()
model.fit(X=tips[['total_bill']], y=tips['tip'])
LinearRegression()
fig.add_trace(go.Scatter(
x=[0, 60],
y=model.predict([[0], [60]]),
mode='lines',
name='Linear: Total Bill Only'
))
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
all_preds = model.predict(tips[['total_bill']])
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)
rmse_dict
{'constant tip amount': 1.3807999538298952, 'one feature: total bill': 1.0178504025697377}
The RMSE of our simple linear model is lower than that of our constant model, which means it does a better job at modeling the training data than our constant model.
Let's try using another feature – table size. Such a model would predict tips using:
$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill} + w_2 \cdot \text{table size}$$model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
LinearRegression()
model_two.predict([[25, 4]])
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
array([3.75716934])
What does this model look like?
Here, we must draw a 3D scatter plot and plane, with one axis for total bill, one axis for table size, and one axis for tip. The code below does this.
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z = model_two.intercept_ + model_two.coef_[0] * XX + model_two.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=tips['total_bill'],
y=tips['size'],
z=tips['tip'], mode='markers', marker = {'color': '#656DF1'}))
fig.update_layout(scene = dict(
xaxis_title='Total Bill',
yaxis_title='Table Size',
zaxis_title='Tip'),
title='Tip vs. Total Bill and Table Size',
width=1000, height=800,
template=TEMPLATE)
How does our two-feature linear model stack up to our single feature linear model and our constant model?
rmse_dict['two features'] = rmse(
tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)
rmse_dict
{'constant tip amount': 1.3807999538298952, 'one feature: total bill': 1.0178504025697377, 'two features': 1.007256127114662}
.score
method of a LinearRegression
object¶Model objects in sklearn
that have already been fit have a score
method.
model_two.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612565
That doesn't look like the RMSE... what is it? 🤔
all_preds
contains model_two
's predicted 'tip'
for every row in tips
.
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
all_preds = model_two.predict(tips[['total_bill', 'size']])
all_preds[:5]
array([1.14617248, 2.7952968 , 3.71198575, 2.37623251, 3.01595454])
Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$
np.var(all_preds) / np.var(tips['tip'])
0.46786930879612576
Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$
Note: By correlation here, we are referring to $r$, the same correlation coefficient you saw in DSC 10.
(np.corrcoef(all_preds, tips['tip'])) ** 2
array([[1. , 0.46786931], [0.46786931, 1. ]])
Method 3: LinearRegression.score
model_two.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612565
All three methods provide the same result!
LinearRegression
summary¶Property | Example | Description |
---|---|---|
Initialize model parameters | lr = LinearRegression() |
Create (empty) linear regression model |
Fit the model to the data | lr.fit(X, y) |
Determines regression coefficients |
Use model for prediction | lr.predict(X_new) |
Uses regression line to make predictions |
Evaluate the model | lr.score(X, y) |
Calculates the $R^2$ of the LR model |
Access model attributes | lr.coef_ , lr.intercept_ |
Accesses the regression coefficients and intercept |
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
'tip'
, we've only used the existing numerical features in our dataset, 'total_bill'
and 'size'
.'sex'
, 'smoker'
, 'day'
, and 'time'
, for example. We can't use these features in their current form, because they're non-numeric.'col'
has $N$ unique values, $A_1$, $A_2$, ..., $A_N$. For each unique value $A_i$, we define the following feature function:'smoker'
¶For each unique value of 'smoker'
in our dataset, we must create a column for just that 'smoker'
. (Remember, 'smoker'
is 'Yes'
when the table was in the smoking section of the restaurant and 'No'
otherwise.)
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 |
tips['smoker'].value_counts()
No 151 Yes 93 Name: smoker, dtype: int64
(tips['smoker'] == 'Yes').astype(int).head()
0 1 1 0 2 1 3 0 4 0 Name: smoker, dtype: int64
for val in tips['smoker'].unique():
tips[f'smoker == {val}'] = (tips['smoker'] == val).astype(int)
tips.head()
total_bill | tip | sex | smoker | day | time | size | smoker == Yes | smoker == No | |
---|---|---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 | 1 | 0 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0 | 1 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 | 1 | 0 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 | 0 | 1 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 | 0 | 1 |
Now that we've converted 'smoker'
to a numerical variable, we can use it as input in a regression model. Here's the model we'll try to fit:
Subtlety: There's no need to use both 'smoker == No'
and 'smoker == Yes'
. If we know the value of one, we already know the value of the other. We can use either one.
model_three = LinearRegression()
model_three.fit(tips[['total_bill', 'size', 'smoker == Yes']], tips['tip'])
LinearRegression()
The following cell gives us our $w^*$s:
model_three.intercept_, model_three.coef_
(0.7090155167346053, array([ 0.09388839, 0.18033156, -0.08343255]))
Thus, our trained linear model to predict tips given total bills, table sizes, and smoker status (yes or no) is:
$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$Our new fit model is:
$$\text{predicted tip} = 0.709 + 0.094 \cdot \text{total bill} + 0.180 \cdot \text{table size} - 0.083 \cdot \text{smoker == Yes}$$To visualize our data and linear model, we'd need 4 dimensions:
'smoker == Yes'
.Humans can't visualize in 4D, but there may be a solution. We know that 'smoker == Yes'
only has two possible values, 1 or 0, so let's look at those cases separately.
Case 1: 'smoker == Yes'
is 1, meaning that the table was in the smoking section.
Case 2: 'smoker == Yes'
is 0, meaning that the table was not in the smoking section.
Key idea: These are two parallel planes in 3D, with different $z$-intercepts!
Note that the two planes are very close to one another – you'll have to zoom in to see the difference.
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z_0 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 0
Z_1 = model_three.intercept_ + model_three.coef_[0] * XX + model_three.coef_[1] * YY + model_three.coef_[2] * 1
plane_0 = go.Surface(x=XX, y=YY, z=Z_0, colorscale='Greens')
plane_1 = go.Surface(x=XX, y=YY, z=Z_1, colorscale='Purples')
fig = go.Figure(data=[plane_0, plane_1])
tips_0 = tips[tips['smoker'] == 'No']
tips_1 = tips[tips['smoker'] == 'Yes']
fig.add_trace(go.Scatter3d(x=tips_0['total_bill'],
y=tips_0['size'],
z=tips_0['tip'], mode='markers', marker = {'color': 'green'}))
fig.add_trace(go.Scatter3d(x=tips_1['total_bill'],
y=tips_1['size'],
z=tips_1['tip'], mode='markers', marker = {'color': 'purple'}))
fig.update_layout(scene = dict(
xaxis_title='Total Bill',
yaxis_title='Table Size',
zaxis_title='Tip'),
title='Tip vs. Total Bill and Table Size (Green = Non-Smoking Section, Purple = Smoking Section)',
width=1000, height=800,
showlegend=False,
template=TEMPLATE)
If we want to visualize in 2D, we need to pick a single feature to place on the $x$-axis.
fig = go.Figure()
fig.add_trace(go.Scatter(x=tips['total_bill'], y=tips['tip'],
mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=tips['total_bill'], y=model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]),
mode='markers', name='Predicted Tips using Total Bill, Table Size, and Smoker Status'))
fig.update_layout(showlegend=True, template=TEMPLATE, title='Tip vs. Total Bill',
xaxis_title='Total Bill', yaxis_title='Tip')
Despite being a linear model, why doesn't this model look like a straight line?
rmse_dict['three features'] = rmse(tips['tip'],
model_three.predict(tips[['total_bill', 'size', 'smoker == Yes']]))
rmse_dict
{'constant tip amount': 1.3807999538298952, 'one feature: total bill': 1.0178504025697377, 'two features': 1.007256127114662, 'three features': 1.0064899786822128}
Adding 'smoker == Yes'
decreased the training RMSE of our model, but barely.
tips.head()
total_bill | tip | sex | smoker | day | time | size | smoker == Yes | smoker == No | |
---|---|---|---|---|---|---|---|---|---|
0 | 3.07 | 1.00 | Female | Yes | Sat | Dinner | 1 | 1 | 0 |
1 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0 | 1 |
2 | 26.59 | 3.41 | Male | Yes | Sat | Dinner | 3 | 1 | 0 |
3 | 14.26 | 2.50 | Male | No | Thur | Lunch | 2 | 0 | 1 |
4 | 21.16 | 3.00 | Male | No | Thur | Lunch | 2 | 0 | 1 |
'smoker'
, but it required a for
-loop.'sex'
, 'smoker'
, 'day'
, and 'time'
– all at once, without using a for
-loop?sklearn.preprocessing
's OneHotEncoder
. More on this in the next lecture!UID | AGE | STATE | HAS_BOUGHT | REVIEW | \ | |
---|---|---|---|---|---|---|
74 | 32 | NY | True | "Meh." | | | ✩✩ |
42 | 50 | WA | True | "Worked out of the box..." | | | ✩✩✩✩ |
57 | 16 | CA | NULL | "Hella tots lit yo..." | | | ✩ |
... | ... | ... | ... | ... | | | ... |
(int) | (int) | (str) | (bool) | (str) | | | (str) |
'RATING'
using the above features.'UID'
was likely used to join the user information (e.g., 'AGE'
and 'STATE'
) with some reviews
dataset.'UID'
s are stored as numbers, the numerical value of a user's 'UID'
won't help us predict their 'RATING'
.'UID'
feature, our model will find whatever patterns it can between 'UID'
s and 'RATING'
s in the training (observed data).'UID'
and 'RATING'
, this will lead to worse model performance on unseen data (bad).'UID'
.There are certain scenarios where manually dropping features might be helpful:
'RATING'
s is so that we can predict 'RATING'
s for users who haven't actually made a 'RATING'
s yet.'RATING'
.'REVIEW'
s after entering 'RATING'
s, we shouldn't use 'REVIEW'
s as a feature.UID | AGE | STATE | HAS_BOUGHT | REVIEW | \ | |
---|---|---|---|---|---|---|
74 | 32 | NY | True | "Meh." | | | ✩✩ |
42 | 50 | WA | True | "Worked out of the box..." | | | ✩✩✩✩ |
57 | 16 | CA | NULL | "Hella tots lit yo..." | | | ✩ |
... | ... | ... | ... | ... | | | ... |
(int) | (int) | (str) | (bool) | (str) | | | (str) |
How do we encode the 'RATING'
column, an ordinal variable, as a quantitative variable?
order_values = ['✩', '✩✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩']
ordinal_enc = {y:x + 1 for (x, y) in enumerate(order_values)}
ordinal_enc
{'✩': 1, '✩✩': 2, '✩✩✩': 3, '✩✩✩✩': 4, '✩✩✩✩✩': 5}
ratings = pd.DataFrame().assign(RATING=['✩', '✩✩', '✩✩✩', '✩✩', '✩✩✩', '✩', '✩✩✩', '✩✩✩✩', '✩✩✩✩✩'])
ratings
RATING | |
---|---|
0 | ✩ |
1 | ✩✩ |
2 | ✩✩✩ |
3 | ✩✩ |
4 | ✩✩✩ |
5 | ✩ |
6 | ✩✩✩ |
7 | ✩✩✩✩ |
8 | ✩✩✩✩✩ |
ratings.replace(ordinal_enc)
RATING | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 2 |
4 | 3 |
5 | 1 |
6 | 3 |
7 | 4 |
8 | 5 |
UID | AGE | STATE | HAS_BOUGHT | REVIEW | \ | |
---|---|---|---|---|---|---|
74 | 32 | NY | True | "Meh." | | | ✩✩ |
42 | 50 | WA | True | "Worked out of the box..." | | | ✩✩✩✩ |
57 | 16 | CA | NULL | "Hella tots lit yo..." | | | ✩ |
... | ... | ... | ... | ... | | | ... |
(int) | (int) | (str) | (bool) | (str) | | | (str) |
How do we encode the 'STATE'
column, a nominal variable, as a quantitative variable? In other words, how do we turn 'STATE'
s into meaningful numbers?
The following dataset, built into the seaborn
plotting library, contains various information about (older) cars.
mpg = sns.load_dataset('mpg').dropna()
mpg.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
We really do mean old:
mpg['model_year'].value_counts()
73 40 78 36 76 34 75 30 82 30 70 29 79 29 72 28 77 28 81 28 71 27 80 27 74 26 Name: model_year, dtype: int64
Let's investigate the relationship between 'horsepower'
and 'mpg'
.
'horsepower'
and 'mpg'
¶# Note: To create a simple scatter plot, all you need is
# px.scatter(mpg, x='horsepower', y='mpg').
# We've used the more complicated go.Scatter approach here so that we can add
# other lines on top.
fig = go.Figure()
fig.add_trace(go.Scatter(
x=mpg['horsepower'],
y=mpg['mpg'],
mode='markers',
name='Original Data')
)
fig.update_layout(showlegend=True, title='MPG vs. Horsepower',
xaxis_title='Horsepower', yaxis_title='MPG',
template=TEMPLATE)
'horsepower'
and 'mpg'
, though it's not quite linear.'horsepower'
to predict 'mpg'
and see what happens.'mpg'
using 'horsepower'
¶car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])
LinearRegression()
What do our predictions look like?
fig.add_trace(go.Scatter(
x=[25, 225],
y=car_model.predict([[25], [225]]),
mode='lines',
name='Predicted MPG using Horsepower'
))
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
Our regression line doesn't quite capture the curvature in the relationship between 'horsepower'
and 'mpg'
.
Let's compute the $R^2$ of car_model
on our training data, for reference:
car_model.score(mpg[['horsepower']], mpg['mpg'])
0.6059482578894348
The Tukey Mosteller Bulge Diagram helps us pick which transformations to apply to data in order to linearize it.
The bottom-left quadrant appears to match the shape of the scatter plot between 'horsepower'
and 'mpg'
the best – let's try taking the log
of 'horsepower'
($X$).
mpg['log hp'] = np.log(mpg['horsepower'])
What does our data look like now?
log_fig = go.Figure()
log_fig.add_trace(go.Scatter(
x=mpg['log hp'],
y=mpg['mpg'],
mode='markers',
name='Original Data')
)
log_fig.update_layout(showlegend=True, title='MPG vs. log(Horsepower)',
xaxis_title='log(Horsepower)', yaxis_title='MPG',
template=TEMPLATE)
'mpg'
using log('horsepower')
¶Let's fit another linear model.
car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])
LinearRegression()
What do our predictions look like now?
log_fig.add_trace(go.Scatter(
x=[3.7, 5.5],
y=car_model_log.predict([[3.7], [5.5]]),
mode='lines',
name='Predicted MPG using log(Horsepower)'
))
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
The fit looks a bit better! How about the $R^2$?
car_model_log.score(mpg[['log hp']], mpg['mpg'])
0.6683347641192137
Also a bit better!
What do our predictions look like on the original, non-transformed scatter plot? Let's see:
fig.add_trace(
go.Scatter(
x=mpg['horsepower'],
y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),
mode='markers', name='Predicted MPG using log(Horsepower)', marker_color='red'
)
)
Our predictions that used $\log(\text{Horsepower})$ as an input don't fall on a straight line. We shouldn't expect them to; the red dots come from:
$$\text{Predicted MPG} = 108.698 - 18.582 \cdot \log(\text{Horsepower})$$car_model_log.intercept_, car_model_log.coef_
(108.69970699574486, array([-18.58218476]))
Until now, feature transformations we've discussed so far have involved converting categorical variables into quantitative variables. However, our log transformation was an example of transforming a quantitative variable into a new quantitative variable; this practice is called quantitative scaling.
LinearRegression
class in sklearn.linear_model
provides an implementation of least squares linear regression that works with multiple features.sklearn
directly.sklearn
Pipeline
s to engineer features and fit models all through a single object.