from dsc80_utils import *

# The dataset is built into plotly, so no file needs to be downloaded.
tips = px.data.tips()
# A bare name on the last line of a notebook cell displays the DataFrame.
tips

# Scatter plot of tip against total bill, one point per row of `tips`.
# NOTE(review): the `fig.update_layout` call implies the pandas plotting backend
# is plotly, presumably configured inside dsc80_utils — confirm.
fig = tips.plot(kind='scatter', x='total_bill', y='tip', title='Tip vs. Total Bill')
# Replace the raw column names on the axes with readable titles.
fig.update_layout(xaxis_title='Total Bill', yaxis_title='Tip')

# Histogram of the 'tip' column (nbins=20) to show how tips are distributed.
fig = tips.plot(kind='hist', x='tip', title='Distribution of Tip', nbins=20)
# Replace the raw column names on the axes with readable titles.
fig.update_layout(xaxis_title='Tip', yaxis_title='Frequency')

# Constant model: a single number, the mean of all observed tips, is used as
# the prediction for every row.
mean_tip = tips['tip'].mean()
mean_tip

2.9982786885245902

# Same scatter plot as before, now with the constant prediction drawn on top.
fig = px.scatter(tips, x='total_bill', y='tip')
# Horizontal line at y = mean_tip: the constant model predicts this value
# regardless of the total bill.
fig.add_hline(mean_tip, line_width=3, line_color='orange', opacity=1)
fig.update_layout(title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

# Mean squared error of predicting `mean_tip` for every observed tip.
np.mean((tips['tip'] - mean_tip) ** 2)

1.9066085124966428

# The same value as the mean squared error above! The MSE of predicting the
# mean is, by definition, the variance of the data (a fact from 40A).
np.var(tips['tip'])

1.9066085124966428

def rmse(actual, pred):
    """Return the root mean squared error between `actual` and `pred`.

    The two arguments are subtracted element-wise, so they must be
    compatible under `-` (e.g. two arrays of the same length, or an
    array and a scalar constant prediction).
    """
    residuals = actual - pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

# RMSE of the constant (mean tip) model, evaluated on the full dataset.
rmse(tips['tip'], mean_tip)

1.3807999538298958

# Keep every model's RMSE under a descriptive label so the models can be
# compared later; the constant (mean tip) model is the first entry.
rmse_dict = {'constant tip amount': rmse(tips['tip'], mean_tip)}
rmse_dict

{'constant tip amount': 1.3807999538298958}

# Show the first few rows as a reminder of which columns are available.
tips.head()

from sklearn.linear_model import LinearRegression

# IPython/Jupyter help syntax: displays the documentation for LinearRegression.
# NOTE: a trailing `?` is not valid in a plain Python script.
LinearRegression?

# Create the model object; no data is involved until .fit is called.
model = LinearRegression()

# Note that there are two arguments to fit – X and y!
# (It is not necessary to write X= and y=)
# X uses double brackets so that it is a one-column DataFrame rather than a
# Series; y is the Series of observed tips.
model.fit(X=tips[['total_bill']], y=tips['tip'])

LinearRegression()

# Fitted intercept and slope. coef_ holds one entry per feature, so [0] pulls
# out the single slope for total_bill.
model.intercept_, model.coef_[0]

(0.9202696135546731, 0.10502451738435337)

# Two x-values are enough to draw each straight line across the plot.
# The column is named 'total_bill' to match the feature name used in fit.
line_pts = pd.DataFrame({'total_bill': [0, 60]})

fig = px.scatter(tips, x='total_bill', y='tip')
# Constant model: the same prediction (mean_tip) at both endpoints.
fig.add_trace(go.Scatter(
    x=line_pts['total_bill'],
    y=[mean_tip, mean_tip],
    mode='lines',
    name='Constant Model (Mean Tip)'
))
# Simple linear model: the fitted line evaluated at the two endpoints.
fig.add_trace(go.Scatter(
    x=line_pts['total_bill'],
    y=model.predict(line_pts),
    mode='lines',
    name='Linear Model: Total Bill Only'
))
fig.update_layout(title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', 
                  yaxis_title='Tip')

# Predict the tip for a total bill of 15. A nested list has no column names,
# so sklearn still predicts but emits a feature-names warning, because the
# model was fit on a DataFrame.
model.predict([[15]])

/Users/sam/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

array([2.5])

# Since we trained on a DataFrame, the input to model.predict should also
# be a DataFrame with the same column name(s); this avoids the feature-names
# warning. Alternatively, fit on NumPy arrays (using .to_numpy() when
# specifying X= and y=) so that plain lists/arrays can be passed to predict.
test_points = pd.DataFrame({'total_bill': [15, 4, 100]})
model.predict(test_points)

array([ 2.5 ,  1.34, 11.42])

# Predictions of the simple linear model for every row it was fit on.
all_preds = model.predict(tips[['total_bill']])

# Record the one-feature model's RMSE next to the constant model's.
rmse_dict.update({'one feature: total bill': rmse(tips['tip'], all_preds)})
rmse_dict

{'constant tip amount': 1.3807999538298958,
 'one feature: total bill': 1.0178504025697377}

# Display the RMSEs collected so far as a raw dictionary.
rmse_dict

{'constant tip amount': 1.3807999538298958,
 'one feature: total bill': 1.0178504025697377}

# Wrap the RMSEs in a one-column DataFrame (labels as the index) purely for a
# nicer display than the raw dictionary.
pd.Series(rmse_dict, name='rmse').to_frame()

# Look at the columns again before choosing a second feature.
tips.head()

# Multiple linear regression: same class as before, but X now has two
# columns (total_bill and size), so two slopes are fitted.
model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])

LinearRegression()

# Fitted intercept and the array of slopes, one per feature, in the column
# order passed to fit: total_bill first, then size.
model_two.intercept_, model_two.coef_

(0.6689447408125031, array([0.09, 0.19]))

# Predict the tip for one new table: total bill of 25 and table size of 4.
# The column names match the ones used in fit.
test_pts = pd.DataFrame({'total_bill': [25], 'size': [4]})
model_two.predict(test_pts)

array([3.76])

# Grid of (total bill, table size) pairs at which to evaluate the fitted plane:
# total bills 0, 2, ..., 48 and sizes 0, 1, ..., 7 (slice stops are exclusive).
XX, YY = np.mgrid[0:50:2, 0:8:1]
# Height of the plane at every grid point; coef_[0] is the total_bill slope
# and coef_[1] is the size slope (the column order used in fit).
Z = model_two.intercept_ + model_two.coef_[0] * XX + model_two.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')

# Draw the fitted plane first, then overlay the observed rows as 3D markers.
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=tips['total_bill'], 
                           y=tips['size'], 
                           z=tips['tip'], mode='markers',
                           marker={'color': '#656DF1', 'size': 5}))

# In a 3D figure the axis titles are set on the scene, not on the layout.
fig.update_layout(scene=dict(xaxis_title='Total Bill',
                             yaxis_title='Table Size',
                             zaxis_title='Tip'),
                  title='Tip vs. Total Bill and Table Size',
                  width=500, height=500)

# Predictions of the two-feature model for every row it was fit on.
two_feature_preds = model_two.predict(tips[['total_bill', 'size']])
rmse_dict['two features'] = rmse(tips['tip'], two_feature_preds)

# Show all three models' RMSEs side by side as a one-column DataFrame.
pd.Series(rmse_dict, name='rmse').to_frame()

# Let's start with the single-variable model:
with_resid = tips.assign(**{
    'Predicted Tip': model.predict(tips[['total_bill']]),
    'Residual': tips['tip'] - model.predict(tips[['total_bill']]),
})
fig = px.scatter(with_resid, x='Predicted Tip', y='Residual')
fig.add_hline(0, line_width=2, opacity=1).update_layout(title='Residual Plot (Simple Linear Model)')

# What about the two-variable model?
with_resid = tips.assign(**{
    'Predicted Tip': model_two.predict(tips[['total_bill', 'size']]),
    'Residual': tips['tip'] - model_two.predict(tips[['total_bill', 'size']]),
})
fig = px.scatter(with_resid, x='Predicted Tip', y='Residual')
fig.add_hline(0, line_width=2, opacity=1).update_layout(title='Residual Plot (Multiple Regression)')

# Load the grades CSV (path is relative to the notebook's working directory)
# and keep only the 'midterm' and 'final' columns.
grades = pd.read_csv('data/gradesW4315.csv')[['midterm', 'final']]
grades

# Show the first few rows of the tips data again.
tips.head()

	track_name
0	i hate you i love you i hate that i love you
1	love me like a love song
2	love you better
3	hate sosa

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
...	...	...	...	...	...	...	...
241	22.67	2.00	Male	Yes	Sat	Dinner	2
242	17.82	1.75	Male	No	Sat	Dinner	2
243	18.78	3.00	Female	No	Thur	Dinner	2

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Property	Example	Description
Initialize model parameters	`lr = LinearRegression()`	Create (empty) linear regression model
Fit the model to the data	`lr.fit(X, y)`	Determines the regression coefficients and intercept from the data
Use model for prediction	`lr.predict(X_new)`	Uses regression line to make predictions
Evaluate the model	`lr.score(X, y)`	Calculates the $R^2$ (coefficient of determination) of the fitted model on `X` and `y`
Access model attributes	`lr.coef_`, `lr.intercept_`	Accesses the regression coefficients and intercept

	rmse
constant tip amount	1.38
one feature: total bill	1.02
two features	1.01

	midterm	final
0	80	103
1	53	79
2	91	122
...	...	...
49	77	98
50	70	134
51	75	99

Lecture 13 – Linear Regression¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Midterm Survey Results¶

Some Thoughts Going Forward¶

Question 🤔 (Answer at q.dsc80.com)

Question 🤔 (Answer at q.dsc80.com)

Modeling¶

Reflection¶

Modeling¶

Philosophy¶

Goals of modeling¶

Features¶

Example: Restaurant tips 🧑‍🍳¶

About the data¶

Predicting tips¶

Exploratory data analysis¶

Model #1: Constant¶

Estimating $h^{\text{true}}$¶

Looking at the data¶

Empirical risk minimization¶

The mean tip¶

The quality of predictions¶

Root mean squared error¶

Computing and storing the RMSE¶

Question 🤔 (Answer at q.dsc80.com)

Model #2: Simple linear regression using total bill¶

Recap: Simple linear regression¶

Empirical risk minimization, by hand¶

Regression in sklearn¶

sklearn¶

The LinearRegression class¶

Fitting a simple linear model¶

Making predictions¶

Comparing models¶

💡 Pro-Tip: Making a DF just to display¶

Question 🤔 (Answer at q.dsc80.com)

Model #3: Multiple linear regression using total bill and table size¶

Multiple linear regression¶

Plane of best fit ✈️¶

Comparing models, again¶

Residual plots¶

Question 🤔 (Answer at q.dsc80.com)

Summary, next time¶

Summary¶

LinearRegression summary¶

Next time¶

Regression in `sklearn`¶

`sklearn`¶

The `LinearRegression` class¶

`LinearRegression` summary¶