# Run this cell to set up packages for lecture.
from lec26_imports import *
Announcements¶
- The Final Project is due tomorrow at 11:59PM.
- If one or both partners have run out of slip days and you submit the project late, we will reallocate slip days towards the final project, away from lesser-weighted assignments. See the syllabus for more details.
- Lab 7 is due on Thursday at 11:59PM.
- Even if you don't need to do this lab for your grade, it's the only assignment on regression, which will be tested on the Final Exam.
- The Final Exam is this Saturday, June 8th from 7-10PM. More details to come.
- Collaborative study session on Friday, June 7th from 5-8PM in Solis 104.
- If at least 75% of the class fills out both SETs and the internal End-of-Quarter Survey by 8AM Saturday, then the entire class will have 1% of extra credit added to their overall grade. We value your feedback!
- Finish the final project before taking the survey.
- Today is the last day of new material. The next two lectures are for review!
- We'll be working through the Fall 2023 Final Exam on Wednesday. Read the data description and attempt the problems on your own before then!
Agenda¶
- Residuals.
- Inference for regression.
Residuals¶
Quality of fit¶
- The regression line describes the "best linear fit" for a given dataset.
- The formulas for the slope and intercept work no matter what the shape of the data is.
- However, the line is only meaningful if the relationship between $x$ and $y$ is roughly linear.
Example: Non-linear data¶
non_linear()
This line doesn't fit the data at all, despite being the "best" line for the data!
Residuals¶
- Any set of predictions has errors.
- When using the regression line to make predictions, the errors are called residuals.
- There is one residual corresponding to each data point $(x, y)$ in the dataset.
def predicted(df, x, y):
    # Predictions made by the regression line that uses column x to predict column y.
    m = slope(df, x, y)
    b = intercept(df, x, y)
    return m * df.get(x) + b

def residual(df, x, y):
    # Residuals: actual y values minus predicted y values.
    return df.get(y) - predicted(df, x, y)
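The helper functions slope, intercept, and calculate_r used throughout this notebook come from lec26_imports. As a reference, here is a minimal sketch of how they are typically defined in terms of standard units; the imported versions may differ in their exact details.
# Sketch of the helpers provided by lec26_imports (assumed definitions, for reference only).
def standard_units(arr):
    # Convert an array or Series to standard units (z-scores).
    return (arr - arr.mean()) / np.std(arr)

def calculate_r(df, x, y):
    # Correlation coefficient: the average product of x and y, measured in standard units.
    return (standard_units(df.get(x)) * standard_units(df.get(y))).mean()

def slope(df, x, y):
    # Slope of the regression line: r * SD(y) / SD(x).
    return calculate_r(df, x, y) * np.std(df.get(y)) / np.std(df.get(x))

def intercept(df, x, y):
    # Intercept of the regression line: mean(y) - slope * mean(x).
    return df.get(y).mean() - slope(df, x, y) * df.get(x).mean()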
Example: Predicting a son's height from his mother's height 👵👨 📏¶
- Is the association between 'mom' and 'son' linear?
- If there is a linear association, is it strong?
- We can answer this using the correlation coefficient.
- Close to 0 = weak, close to -1/+1 = strong.
- Is "linear" the best description of the association between 'mom' and 'son'?
- We'll use residuals to answer this question.
galton = bpd.read_csv('data/galton.csv')
male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom=male_children.get('mother'),
                                 son=male_children.get('childHeight'))
mom_son_predictions = mom_son.assign(predicted=predicted(mom_son, 'mom', 'son'),
                                     residuals=residual(mom_son, 'mom', 'son'))
plot_regression_line(mom_son_predictions, 'mom', 'son', resid=True)
Correlation: 0.3230049836849053
Residual plots¶
The residual plot of a regression line is the scatter plot with the $x$ variable on the $x$-axis and residuals on the $y$-axis.
$$\text{residual} = \text{actual } y - \text{predicted } y \text{ by regression line}$$
- Residual plots describe how the error in the regression line's predictions varies.
- Key idea: If a linear fit is good, the residual plot should look like a patternless "cloud" ☁️.
mom_son_predictions.plot(kind='scatter', x='mom', y='residuals', s=50, c='purple', figsize=(10, 5), label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for predicting son\'s height based on mother\'s height')
plt.legend();
The residual plot for a non-linear association 🚗¶
- Consider the hybrid cars dataset from earlier.
- Let's look at a regression line that uses 'mpg' to predict 'price'.
hybrid = bpd.read_csv('data/hybrid.csv')
mpg_price = hybrid.assign(
    predicted=predicted(hybrid, 'mpg', 'price'),
    residuals=residual(hybrid, 'mpg', 'price')
)
mpg_price
|     | vehicle | year | price | acceleration | mpg | class | predicted | residuals |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact | 32609.64 | -8099.90 |
| 1 | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact | 19278.39 | 16076.58 |
| 2 | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact | 28487.75 | -1655.50 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize | 30803.06 | 2146.94 |
| 151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize | 30803.06 | 7896.94 |
| 152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact | 37032.62 | 2112.38 |
153 rows × 8 columns
# Plot of the original data and regression line.
plot_regression_line(hybrid, 'mpg', 'price');
print('Correlation:', calculate_r(hybrid, 'mpg', 'price'))
Correlation: -0.5318263633683786
# Residual plot.
mpg_price.plot(kind='scatter', x='mpg', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for regression between mpg and price')
plt.legend();
Note that as 'mpg' increases, the residuals go from being mostly large, to being mostly small, to being mostly large again. That's a pattern!
Issue: Patterns in the residual plot¶
- Patterns in the residual plot imply that the relationship between $x$ and $y$ may not be linear.
- While this can be spotted in the original scatter plot, it may be easier to identify in the residual plot.
- In such cases, a curve may be a better choice than a line for prediction.
- In future courses, you'll learn how to extend linear regression to work for polynomials and other types of mathematical functions; a rough preview is sketched below.
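As a rough preview of what such a curve could look like, here's a sketch that fits a degree-2 polynomial to the 'mpg'/'price' data using np.polyfit. This is only for illustration and isn't part of the DSC 10 toolkit; the variable names here are made up for this example.
# Sketch only: fit a degree-2 polynomial (a parabola) to mpg vs. price.
mpg_arr = np.asarray(hybrid.get('mpg'))
price_arr = np.asarray(hybrid.get('price'))
a2, a1, a0 = np.polyfit(mpg_arr, price_arr, 2)  # Coefficients of a2 * x**2 + a1 * x + a0.
curve_predictions = a2 * mpg_arr ** 2 + a1 * mpg_arr + a0
curve_residuals = price_arr - curve_predictions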
Another example: 'mpg' and 'acceleration' ⛽¶
- Let's fit a regression line that predicts 'mpg' given 'acceleration'.
- Let's then look at the resulting residual plot.
accel_mpg = hybrid.assign(
    predicted=predicted(hybrid, 'acceleration', 'mpg'),
    residuals=residual(hybrid, 'acceleration', 'mpg')
)
accel_mpg
|     | vehicle | year | price | acceleration | mpg | class | predicted | residuals |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact | 43.29 | -2.03 |
| 1 | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact | 41.90 | 12.20 |
| 2 | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact | 42.33 | 2.90 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize | 35.17 | 7.83 |
| 151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize | 35.17 | 7.83 |
| 152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact | 36.40 | 0.60 |
153 rows × 8 columns
# Plot of the original data and regression line.
plot_regression_line(accel_mpg, 'acceleration', 'mpg')
print('Correlation:', calculate_r(accel_mpg, 'acceleration', 'mpg'))
Correlation: -0.5060703843771186
# Residual plot.
accel_mpg.plot(kind='scatter', x='acceleration', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for regression between acceleration and mpg')
plt.legend();
Note that the residuals tend to vary more for smaller accelerations than they do for larger accelerations – that is, the vertical spread of the plot is not similar at all points on the $x$-axis.
Issue: Uneven vertical spread¶
- If the vertical spread in a residual plot is uneven, it implies that the regression line's predictions aren't equally reliable for all inputs.
- This doesn't necessarily mean that fitting a non-linear curve would be better; it just impacts how we interpret the regression line's predictions.
- For instance, in the previous plot, we see that the regression line's predictions for cars with larger accelerations are more reliable than those for cars with smaller accelerations. (A quick numeric check of this is sketched below.)
- The formal term for "uneven spread" is heteroscedasticity.
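To make "uneven spread" concrete, here's a quick numeric check that compares the spread of the residuals on either side of an acceleration threshold. The threshold of 14 is arbitrary and chosen only for illustration.
# Quick check: compare residual spread for smaller vs. larger accelerations (threshold of 14 is arbitrary).
slower = accel_mpg[accel_mpg.get('acceleration') < 14]
faster = accel_mpg[accel_mpg.get('acceleration') >= 14]
print('SD of residuals, acceleration < 14: ', np.std(slower.get('residuals')))
print('SD of residuals, acceleration >= 14:', np.std(faster.get('residuals')))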
Example: Anscombe's quartet¶
- All 4 datasets have the same mean of $x$, mean of $y$, SD of $x$, SD of $y$, and correlation.
- Therefore, they have the same regression line because the slope and intercept of the regression line are determined by these 5 quantities.
- But they all look very different! Not all of them contain linear associations.
Example: The Datasaurus Dozen 🦖¶
dino = bpd.read_csv('data/Datasaurus_data.csv')
dino
|     | x | y |
| --- | --- | --- |
| 0 | 55.38 | 97.18 |
| 1 | 51.54 | 96.03 |
| 2 | 46.15 | 94.49 |
| ... | ... | ... |
| 139 | 50.00 | 95.77 |
| 140 | 47.95 | 95.00 |
| 141 | 44.10 | 92.69 |
142 rows × 2 columns
calculate_r(dino, 'x', 'y')
-0.06447185270095163
slope(dino, 'x', 'y')
-0.10358250243265595
intercept(dino, 'x', 'y')
53.452978449229235
plot_regression_line(dino, 'x', 'y');
Takeaway: Never trust summary statistics alone; always visualize your data!
Inference for regression¶
Another perspective on regression¶
- What we're really doing:
- Collecting a sample of data from a population.
- Fitting a regression line to that sample.
- Using that regression line to make predictions for inputs that are not in our sample (e.g. for mothers whose sons we don't know the heights of).
- What if we used a different sample? 🤔
Concept Check ✅ – Answer at cc.dsc10.com¶
What strategy will help us assess how different our regression line's predictions would have been if we'd used a different sample?
- A. Hypothesis testing
- B. Permutation testing
- C. Bootstrapping
- D. Central Limit Theorem
Don't scroll past this point without answering!
Prediction intervals¶
We want to come up with a range of reasonable values for a prediction for a single input $x$. To do so, we'll:
- Bootstrap the sample.
- Compute the slope and intercept of the regression line for that sample.
- Repeat steps 1 and 2 many times to compute many slopes and many intercepts.
- For a given $x$, use the bootstrapped slopes and intercepts to create bootstrapped predictions, and take the middle 95% of them.
The resulting interval is called a prediction interval.
Bootstrapping the scatter plot of mother/son heights¶
Note that each time we run this cell, the resulting line is slightly different!
# Step 1: Resample the dataset.
resample = mom_son.sample(mom_son.shape[0], replace=True)
# Step 2: Compute the slope and intercept of the regression line for that resample.
plot_regression_line(resample, 'mom', 'son', alpha=0.5)
plt.ylim([60, 80])
plt.xlim([57, 72]);
Bootstrapping predictions: mother/son heights¶
m_orig = slope(mom_son, 'mom', 'son')
b_orig = intercept(mom_son, 'mom', 'son')
m_boot = np.array([])
b_boot = np.array([])
for i in np.arange(5000):
    # Step 1: Resample the dataset.
    resample = mom_son.sample(mom_son.shape[0], replace=True)

    # Step 2: Compute the slope and intercept of the regression line for that resample.
    m = slope(resample, 'mom', 'son')
    b = intercept(resample, 'mom', 'son')

    m_boot = np.append(m_boot, m)
    b_boot = np.append(b_boot, b)
If a mother is 68 inches tall, how tall do we predict her son to be?¶
Using the original dataset, and hence the original slope and intercept, we get a single prediction for the input of 68.
pred_orig = m_orig * 68 + b_orig
pred_orig
70.68219686848825
Using the bootstrapped slopes and intercepts, we get an interval of predictions for the input of 68.
m_orig
0.3650611602425757
m_boot
array([0.33, 0.36, 0.42, ..., 0.33, 0.33, 0.4 ])
b_orig
45.8580379719931
b_boot
array([48.18, 46.26, 42.22, ..., 48.02, 48.13, 43.57])
boot_preds = m_boot * 68 + b_boot
boot_preds
array([70.74, 70.53, 70.98, ..., 70.6 , 70.88, 70.8 ])
l = np.percentile(boot_preds, 2.5)
r = np.percentile(boot_preds, 97.5)
[l, r]
[70.21553543791681, 71.15983764737595]
bpd.DataFrame().assign(
    predictions=boot_preds
).plot(kind='hist', density=True, bins=20, figsize=(10, 5), ec='w', title='Interval of predicted heights for the son of a 68 inch tall mother')
plt.plot([l,r],[0.01,0.01], c='gold', linewidth=10, zorder=1, label='95% prediction interval')
plt.legend();
How different could our prediction have been, for all inputs?¶
Here, we'll plot several of our bootstrapped lines. What do you notice?
draw_many_lines(m_boot, b_boot)
Observations:
- All bootstrapped lines pass through $$(\text{mean mother's height in resample}, \text{mean son's height in resample})$$
- Predictions seem to vary more for very tall and very short mothers than they do for mothers with an average height.
Prediction interval width vs. mother's height¶
slider_widget()
Note that the closer a mother's height is to the mean mother's height, the narrower the prediction interval for her son's height is!
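To quantify this, here's a short sketch that computes the width of the 95% prediction interval at a few example mother heights, using the bootstrapped slopes and intercepts from above. The specific heights are chosen just for illustration.
# Width of the 95% prediction interval at a few example mother heights.
for example_height in [58, 64, 70]:
    example_preds = m_boot * example_height + b_boot
    width = np.percentile(example_preds, 97.5) - np.percentile(example_preds, 2.5)
    print('mom height:', example_height, '| interval width:', round(width, 2))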
Summary, next time¶
Summary¶
- Residuals are the errors in the predictions made by the regression line.
- Residual plots help us determine whether a line is a good fit to our data.
- No pattern in residual plot = good linear fit.
- Patterns in residual plot = poor linear fit.
- The correlation coefficient, $r$, doesn't tell the full story! 🦖
- To see how our predictions might have been different if we'd had a different sample, bootstrap!
- Resample the data points and make a prediction using the regression line for each resample.
- Many resamples lead to many predictions. Take the middle 95% of them to get a 95% prediction interval.
Next time¶
- We're done with introducing new material!
- We'll review in class on Wednesday and Friday.