import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.graph_objects as go
# Route DataFrame.plot(...) calls through plotly rather than pandas'
# default plotting backend.
pd.options.plotting.backend = 'plotly'
# Name of the plotly template passed via `template=` when building figures.
TEMPLATE = 'seaborn'


# The dataset is built into plotly (and seaborn)!
# The bare `tips` expression on the last line displays the DataFrame
# when this is run as a notebook cell.
tips = px.data.tips()
tips


# Scatter plot of tip against total bill, to see how the two relate.
tips.plot(kind='scatter', 
          x='total_bill', y='tip',
          title='Tip vs. Total Bill',
          template=TEMPLATE)


# Histogram of the total bill column on its own (50 bins).
tips.plot(kind='hist', 
          x='total_bill', 
          title='Distribution of Total Bill',
          nbins=50,
          template=TEMPLATE)


# Histogram of the tip column on its own (50 bins).
tips.plot(kind='hist', 
          x='tip', 
          title='Distribution of Tip',
          nbins=50,
          template=TEMPLATE)


# Average of the 'tip' column; reused below as the constant model's
# single prediction for every row.
mean_tip = tips['tip'].mean()
mean_tip

2.99827868852459


# plotly has no one-call shortcut for drawing a scatter plot together
# with a reference line, so both traces are spelled out explicitly.
fig = go.Figure(data=[
    # Every (total bill, tip) observation as an individual marker.
    go.Scatter(
        x=tips['total_bill'],
        y=tips['tip'],
        mode='markers',
        name='Original Data',
    ),
    # Horizontal line at the mean tip: the constant model's prediction.
    go.Scatter(
        x=[0, 60],
        y=[mean_tip, mean_tip],
        mode='lines',
        name='Constant Prediction (Mean)',
    ),
])

fig.update_layout(
    title='Tip vs. Total Bill',
    xaxis_title='Total Bill',
    yaxis_title='Tip',
    showlegend=True,
    template=TEMPLATE,
)
# Pin the x-axis to the same span the prediction line is drawn over.
fig.update_xaxes(range=[0, 60])


# Mean squared error of predicting `mean_tip` for every row.
np.mean((tips['tip'] - mean_tip) ** 2)

1.9066085124966412


# The same! A fact from 40A: the mean squared error of the constant
# prediction `mean_tip` is exactly the variance of the tips.
# (np.var uses ddof=0 by default, i.e. it divides by n.)
np.var(tips['tip'])

1.9066085124966412


def rmse(actual, pred):
    """Return the root mean squared error between `actual` and `pred`.

    The two arguments are subtracted directly, so they may be any pair of
    values that supports `-` (for example an array/Series and a scalar,
    or two equally sized arrays).
    """
    residuals = actual - pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


rmse(tips['tip'], mean_tip)

1.3807999538298954


# Running scoreboard mapping a model description to its RMSE on `tips`,
# so the models can be compared as each one is fit.
rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)
rmse_dict

{'constant tip amount': 1.3807999538298954}


tips.head()


from sklearn.linear_model import LinearRegression


LinearRegression?


model = LinearRegression()

# Note that there are two arguments to fit – X and y!
# (It is not necessary to write X= and y=)
# X is selected with double brackets, giving a one-column DataFrame
# rather than a Series; y is the single 'tip' column.
model.fit(X=tips[['total_bill']], y=tips['tip'])

LinearRegression()


model.intercept_, model.coef_

(0.9202696135546731, array([0.10502452]))


# Add the fitted regression line to the existing `fig` by predicting at
# the two x-axis endpoints. The inputs here are plain nested lists with
# no column names, whereas the model was fit on a DataFrame.
fig.add_trace(go.Scatter(
    x=[0, 60],
    y=model.predict([[0], [60]]),
    mode='lines',
    name='Linear: Total Bill Only'
))

/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names


model.predict([[15]])

/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

array([2.49563737])


# The input to model.predict **must** be a 2D list/array.
model.predict([[15],
               [4],
               [100]])

/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

array([ 2.49563737,  1.34036768, 11.42272135])


# Same predictions as the nested-list call: reshape(-1, 1) turns the
# 1D array into a single-column 2D array, one row per total bill.
model.predict(np.array(
    [15, 4, 100]
).reshape(-1, 1))

/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

array([ 2.49563737,  1.34036768, 11.42272135])


# Predicted tip for every row of the dataset.
all_preds = model.predict(tips[['total_bill']])


# Record the one-feature model's RMSE next to the constant model's.
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)
rmse_dict

{'constant tip amount': 1.3807999538298954,
 'one feature: total bill': 1.0178504025697377}


tips.head()


# Second linear model: same target ('tip'), now fit on two feature
# columns, 'total_bill' and 'size'.
model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])

LinearRegression()


model_two.intercept_, model_two.coef_

(0.6689447408125031, array([0.09271334, 0.19259779]))


model_two.predict([[25, 4]])

/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

array([3.75716934])


# Grid over the feature space: total bill 0, 2, ..., 48 and table size 0..7.
XX, YY = np.mgrid[0:50:2, 0:8:1]
# Height of the fitted plane at every grid point.
Z = model_two.intercept_ + model_two.coef_[0] * XX + model_two.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')

fig = go.Figure()
fig.add_trace(plane)
# Overlay the observed rows so they can be compared against the plane.
fig.add_trace(
    go.Scatter3d(
        x=tips['total_bill'],
        y=tips['size'],
        z=tips['tip'],
        mode='markers',
        marker=dict(color='#656DF1'),
    )
)

fig.update_layout(
    title='Tip vs. Total Bill and Table Size',
    width=1000,
    height=800,
    scene={
        'xaxis_title': 'total bill',
        'yaxis_title': 'table size',
        'zaxis_title': 'tip',
    },
)


# Record the two-feature model's RMSE, computed on the same rows it was
# fit on, alongside the earlier models.
rmse_dict['two features'] = rmse(
    tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)


rmse_dict

{'constant tip amount': 1.3807999538298954,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.007256127114662}

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4
...	...	...	...	...	...	...	...
239	29.03	5.92	Male	No	Sat	Dinner	3
240	27.18	2.00	Female	Yes	Sat	Dinner	2
241	22.67	2.00	Male	Yes	Sat	Dinner	2
242	17.82	1.75	Male	No	Sat	Dinner	2
243	18.78	3.00	Female	No	Thur	Dinner	2

`'total_bill'`	`'tip'`
Right skewed	Right skewed
Mean around \$20	Mean around \$3
Mode around \$16	Possibly bimodal at \$2 and \$3?
No particularly large bills	Large outliers?

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 20 – Modeling and Linear Regression¶

DSC 80, Spring 2023¶

Agenda¶

Modeling¶

Reflection¶

Modeling¶

Goals of modeling¶

Features¶

Example: Restaurant tips 🧑‍🍳¶

About the data¶

Predicting tips¶

Exploratory data analysis (EDA)¶

Visualizing distributions¶

Observations¶

Model #1: Constant¶

Estimating $h^{\text{true}}$¶

Empirical risk minimization¶

The mean tip¶

The quality of predictions¶

Root mean squared error¶

Computing and storing the RMSE¶

Model #2: Simple linear regression using total bill¶

Recap: Simple linear regression¶

Empirical risk minimization, by hand¶

Regression in `sklearn`¶

`sklearn`¶

The `LinearRegression` class¶

Fitting a simple linear model¶

Making predictions¶

Comparing models¶

Model #3: Multiple linear regression using total bill and table size¶

Multiple linear regression¶

Plane of best fit ✈️¶

Comparing models, again¶

Conclusion¶

Summary, next time¶

Summary¶

Next time¶

Lecture 20 – Modeling and Linear Regression¶

DSC 80, Spring 2023¶

Agenda¶

Modeling¶

Reflection¶

Modeling¶

Goals of modeling¶

Features¶

Example: Restaurant tips 🧑‍🍳¶

About the data¶

Predicting tips¶

Exploratory data analysis (EDA)¶

Visualizing distributions¶

Observations¶

Model #1: Constant¶

Estimating $h^{\text{true}}$¶

Empirical risk minimization¶

The mean tip¶

The quality of predictions¶

Root mean squared error¶

Computing and storing the RMSE¶

Model #2: Simple linear regression using total bill¶

Recap: Simple linear regression¶

Empirical risk minimization, by hand¶

Regression in sklearn¶

sklearn¶

The LinearRegression class¶

Fitting a simple linear model¶

Making predictions¶

Comparing models¶

Model #3: Multiple linear regression using total bill and table size¶

Multiple linear regression¶

Plane of best fit ✈️¶

Comparing models, again¶

Conclusion¶

Summary, next time¶

Summary¶

Next time¶

Regression in `sklearn`¶

`sklearn`¶

The `LinearRegression` class¶