# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
# Render inline figures as SVG and use matplotlib's 'ggplot' style sheet.
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed NumPy arrays short: summarize arrays with more than 20 elements,
# show 2 digits of precision, and suppress scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Keep displayed DataFrames short: at most 7 rows and 8 columns, 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from ipywidgets import widgets
from IPython.display import clear_output, display

import warnings
warnings.filterwarnings('ignore')

def standard_units(any_numbers):
    """Convert a sequence of numbers to standard units.

    Every value has the mean of the input subtracted from it and is then
    divided by the input's standard deviation as computed by ``np.std``.
    The input must provide a ``.mean()`` method and support elementwise
    arithmetic (for example, a NumPy array).
    """
    deviations = any_numbers - any_numbers.mean()
    spread = np.std(any_numbers)
    return deviations / spread

def standardize(df):
    """Return a new DataFrame holding every column of df in standard units.

    Each output column is named after its source column with ' (su)'
    appended, and columns are added in the order they appear in df.
    """
    result = bpd.DataFrame()
    for name in df.columns:
        converted = standard_units(df.get(name))
        labeled = {name + ' (su)': converted}
        result = result.assign(**labeled)
    return result

def correlation(df, x, y):
    '''Computes the correlation between column x and column y of df.

    The result is the mean of the elementwise product of the two columns
    after each has been converted to standard units.
    '''
    x_su = standard_units(df.get(x))
    y_su = standard_units(df.get(y))
    products = x_su * y_su
    return products.mean()

def slope(df, x, y):
    '''Returns the slope of the regression line between columns x and y in df (in original units).

    Computed as the correlation times the standard deviation of y, divided
    by the standard deviation of x.
    '''
    corr = correlation(df, x, y)
    sd_of_y = np.std(df.get(y))
    sd_of_x = np.std(df.get(x))
    return corr * sd_of_y / sd_of_x

def intercept(df, x, y):
    '''Returns the intercept of the regression line between columns x and y in df (in original units).

    Computed as the mean of y minus the slope times the mean of x.
    '''
    mean_of_y = df.get(y).mean()
    line_slope = slope(df, x, y)
    mean_of_x = df.get(x).mean()
    return mean_of_y - line_slope * mean_of_x

# All of the following code is for visualization.
def plot_regression_line(df, x, y, margin=.02, alpha=1):
    '''Draws a scatter plot of columns x and y in df with the regression line (in original units) on top.

    Parameters:
        df: DataFrame containing the two columns.
        x: name of the column on the horizontal axis (the predictor).
        y: name of the column on the vertical axis (the response).
        margin: fraction of each extreme x value by which the drawn line
            extends beyond the smallest and largest x.
        alpha: opacity of the scatter plot points.

    The figure title shows the equation of the regression line.
    '''
    m = slope(df, x, y)
    b = intercept(df, x, y)

    df.plot(kind='scatter', x=x, y=y, s=50, figsize=(10, 5), label='original data', alpha=alpha)

    # Pad the endpoints outward by a fraction of their magnitude. Scaling by
    # (1 - margin) / (1 + margin) directly would move a negative endpoint in
    # the wrong direction, leaving the line short of the extreme data points.
    x_min = df.get(x).min()
    x_max = df.get(x).max()
    left = x_min - margin * abs(x_min)
    right = x_max + margin * abs(x_max)

    domain = np.linspace(left, right, 10)
    plt.plot(domain, m*domain + b, color='orange', label='regression line', lw=4)
    plt.suptitle(format_equation(m, b), fontsize=18)
    plt.legend()

def format_equation(m, b):
    '''Returns the line y = mx + b as a '$'-delimited string for use as a plot title.

    Both numbers are shown with two decimal places. A positive intercept is
    written as '+ b', a zero intercept is omitted, and a negative intercept
    is written using the number's own minus sign.
    '''
    if b > 0:
        return r'$y = %.2fx + %.2f$' % (m, b)
    elif b == 0:
        # The closing '$' is required; without it the delimiters are unbalanced.
        return r'$y = %.2fx$' % m
    else:
        return r'$y = %.2fx %.2f$' % (m, b)


# Seed NumPy's global random number generator so the noise below is reproducible.
np.random.seed(23)
# Build a 25-row example with a non-linear association:
# x is an evenly spaced grid from -6 to 6 (step 0.5) plus normal noise, and
# y is the square of that same grid plus normal noise.
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5) + np.random.normal(size=25), 
    y=np.arange(-6, 6.1, 0.5)**2 + np.random.normal(size=25)
)
# Scatter plot of the example with its regression line.
plot_regression_line(x2, 'x', 'y')


def predicted(df, x, y):
    '''Returns the regression line's prediction of column y for every value in column x of df.'''
    line_slope = slope(df, x, y)
    line_intercept = intercept(df, x, y)
    x_values = df.get(x)
    return line_slope * x_values + line_intercept

def residual(df, x, y):
    '''Returns the residuals (actual y minus predicted y) of the regression of column y on column x in df.'''
    actual = df.get(y)
    fitted = predicted(df, x, y)
    return actual - fitted


# Load the Galton heights data.
galton = bpd.read_csv('data/galton.csv')

# Keep only the rows whose 'gender' is 'male', then build a two-column
# DataFrame pairing each mother's height with her son's height.
male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom = male_children.get('mother'), 
                                 son = male_children.get('childHeight'))

# Add the regression predictions of 'son' from 'mom' and the corresponding residuals.
mom_son_predictions = mom_son.assign(
    predicted=predicted(mom_son, 'mom', 'son'),
    residuals=residual(mom_son, 'mom', 'son'),
)


# Scatter plot of sons' heights against mothers' heights with the regression line.
plot_regression_line(mom_son_predictions, 'mom', 'son')

# Pick 50 random row positions (repeats are possible) and, for each, draw a
# vertical segment from the actual son's height to the predicted height.
idx = np.random.randint(0, mom_son_predictions.shape[0], size=50)
for i, k in enumerate(idx):
    x = mom_son_predictions.get('mom').iloc[k]
    y = mom_son_predictions.get('son').iloc[k]
    p = mom_son_predictions.get('predicted').iloc[k]
    # Only the first segment is labeled so the legend gets a single 'residuals' entry.
    plt.plot([x,x], [y,p], linewidth=3, color='purple', label='residuals' if i==0 else None)
plt.legend();
print('Correlation:', correlation(mom_son, 'mom', 'son'))

Correlation: 0.32300498368490554


# Residual plot: residuals against mothers' heights, with a horizontal reference line at 0.
mom_son_predictions.plot(kind='scatter', x='mom', y='residuals', s=50, c='purple', figsize=(10, 5), label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for predicting son\'s height based on mother\'s height')
plt.legend();


# Load the hybrid cars data and add the regression predictions of 'msrp'
# from 'mpg' along with the corresponding residuals.
hybrid = bpd.read_csv('data/hybrid.csv')
mpg_msrp = hybrid.assign(
    predicted=predicted(hybrid, 'mpg', 'msrp'),
    residuals=residual(hybrid, 'mpg', 'msrp')
)
mpg_msrp


# Plot of the original data ('msrp' against 'mpg') and regression line,
# followed by the correlation between the two columns.
plot_regression_line(hybrid, 'mpg', 'msrp');
print('Correlation:', correlation(hybrid, 'mpg', 'msrp'))

Correlation: -0.5318263633683789


# Residual plot: residuals of predicting 'msrp' from 'mpg', plotted against
# 'mpg', with a horizontal reference line at 0.
mpg_msrp.plot(kind='scatter', x='mpg', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
plt.title('Residual plot for regression between mpg and msrp')
plt.legend();


# Add the regression predictions of 'mpg' from 'acceleration' and the
# corresponding residuals to the hybrid cars data.
accel_mpg = hybrid.assign(
    predicted=predicted(hybrid, 'acceleration', 'mpg'),
    residuals=residual(hybrid, 'acceleration', 'mpg')
)
accel_mpg


# Plot of the original data ('mpg' against 'acceleration') and regression line,
# followed by the correlation between the two columns.
plot_regression_line(accel_mpg, 'acceleration', 'mpg')
print('Correlation:', correlation(accel_mpg, 'acceleration', 'mpg'))

Correlation: -0.5060703843771185


# Residual plot: residuals of predicting 'mpg' from 'acceleration', plotted
# against 'acceleration', with a horizontal reference line at 0.
accel_mpg.plot(kind='scatter', x='acceleration', y='residuals', figsize=(10, 5), s=50, color='purple', label='residuals')
plt.axhline(0, linewidth=3, color='k', label='y = 0')
# The residuals in accel_mpg come from the regression of 'mpg' on 'acceleration'.
plt.title('Residual plot for regression between acceleration and mpg')
plt.legend();


# Step 1: Resample the dataset.
# Draw as many rows as the original DataFrame has, with replacement.
resampled = mom_son.sample(mom_son.shape[0], replace=True)

# Step 2: Compute the slope and intercept of the regression line for that resample.
# plot_regression_line computes both and draws the line over the resampled points.
plot_regression_line(resampled, 'mom', 'son', alpha=0.5)

# Use fixed axis limits (presumably so plots of different resamples are comparable).
plt.ylim([60, 80])
plt.xlim([57, 72]);


# Slope and intercept of the regression line fitted to the original (non-resampled) data.
m_orig = slope(mom_son, 'mom', 'son')
b_orig = intercept(mom_son, 'mom', 'son')


# Bootstrap the regression line 5000 times, recording each resample's slope
# and intercept. Values are collected in lists and converted to arrays once
# at the end; calling np.append inside the loop would copy the whole array
# on every iteration.
boot_slopes = []
boot_intercepts = []

for i in np.arange(5000):
    # Step 1: Resample the dataset (same number of rows, with replacement).
    resample = mom_son.sample(mom_son.shape[0], replace=True)

    # Step 2: Compute the slope and intercept of the regression line for that resample.
    m = slope(resample, 'mom', 'son')
    b = intercept(resample, 'mom', 'son')
    boot_slopes.append(m)
    boot_intercepts.append(b)

# Later cells do elementwise arithmetic on these, so they must be arrays.
boot_slopes = np.array(boot_slopes)
boot_intercepts = np.array(boot_intercepts)


# The mother's height (the input value) at which predictions are made below.
mom = 68


# Prediction from the regression line fitted to the original data.
pred_orig = m_orig * mom + b_orig
pred_orig

70.68219686848828


# One prediction per bootstrapped line, all at the same input value `mom`.
boot_preds = boot_slopes * mom + boot_intercepts
boot_preds

array([70.74, 70.53, 70.98, ..., 70.6 , 70.88, 70.8 ])


# Endpoints of the middle 95% of the bootstrapped predictions
# (2.5th and 97.5th percentiles).
l = np.percentile(boot_preds, 2.5)
r = np.percentile(boot_preds, 97.5)
[l, r]

[70.21553543791688, 71.159837647376]


# Density histogram of the bootstrapped predictions, with the interval [l, r]
# drawn as a thick horizontal bar just above the x-axis.
bpd.DataFrame().assign(
    predictions=boot_preds
).plot(kind='hist', density=True, bins=20, figsize=(10, 5), ec='w', title='Interval of predicted heights for the son of a 68 inch tall mother')
plt.plot([l,r],[0.01,0.01], c='gold', linewidth=10, zorder=1, label='95% prediction interval')
plt.legend();


# Don't worry about the code for this.
# Draws the first 50 bootstrapped regression lines over x = 50, 51, ..., 79.
plt.figure(figsize=(10, 5))
x = np.arange(50, 80)
ys = []
# zip stops at the shorter input, so only 50 (slope, intercept) pairs are used.
for i, (m, b) in enumerate(zip(boot_slopes[:50], boot_intercepts)):
    # NOTE(review): ys and the loop index i are not used elsewhere in the visible code.
    ys.append(m * x + b)
    fig = plt.plot(x, m * x + b, linewidth=1)


# Don't worry about the code for this.

def pred_interval(mom):
    '''Plots 50 bootstrapped regression lines and the middle 95% of bootstrapped predictions at mom.

    Reads the module-level arrays boot_slopes and boot_intercepts. The
    interval runs from the 2.5th to the 97.5th percentile of the predictions
    made by all bootstrapped lines at the input value mom; it is drawn as a
    vertical segment at x = mom and reported (rounded to 3 decimals) in the title.

    Parameters:
        mom: the mother's height at which to make predictions.
    '''
    plt.figure(figsize=(10, 5))
    x = np.arange(50, 80)
    # Faintly draw the first 50 bootstrapped lines (zip stops at the shorter input).
    for m, b in zip(boot_slopes[:50], boot_intercepts):
        plt.plot(x, m * x + b, linewidth=1, alpha=0.1)

    boot_preds = boot_slopes * mom + boot_intercepts
    lower = np.percentile(boot_preds, 2.5)
    upper = np.percentile(boot_preds, 97.5)
    # Convert to built-in floats before formatting: NumPy 2 scalars inside a
    # list are shown as 'np.float64(...)', which would clutter the title.
    bounds = [float(np.round(lower, 3)), float(np.round(upper, 3))]

    plt.plot([mom, mom], [lower, upper], linewidth=5, color='#eb7e35', label='95% prediction interval')
    plt.xlim(50, 80)
    plt.ylim(62, 77)
    plt.title(f'95% prediction interval for the height of a son whose mother is {mom} inches tall: {bounds}')
    plt.legend()
    plt.show()

def slider_widget():
    '''Displays an integer slider for the mother's height along with the output of pred_interval for the selected value.'''
    height_slider = widgets.IntSlider(
        value=64,
        min=50,
        max=78,
        step=1,
        description="mom's height",
    )
    controls = widgets.HBox([height_slider])
    plot_area = widgets.interactive_output(pred_interval, {'mom': height_slider})
    display(controls, plot_area)

# Show the interactive slider and its plot.
slider_widget()

	vehicle	year	msrp	acceleration	mpg	class	predicted	residuals
0	Prius (1st Gen)	1997	24509.74	7.46	41.26	Compact	32609.64	-8099.90
1	Tino	2000	35354.97	8.20	54.10	Compact	19278.39	16076.58
2	Prius (2nd Gen)	2000	26832.25	7.97	45.23	Compact	28487.75	-1655.50
...	...	...	...	...	...	...	...	...
150	C-Max Energi Plug-in	2013	32950.00	11.76	43.00	Midsize	30803.06	2146.94
151	Fusion Energi Plug-in	2013	38700.00	11.76	43.00	Midsize	30803.06	7896.94
152	Chevrolet Volt	2013	39145.00	11.11	37.00	Compact	37032.62	2112.38

	vehicle	year	msrp	acceleration	mpg	class	predicted	residuals
0	Prius (1st Gen)	1997	24509.74	7.46	41.26	Compact	43.29	-2.03
1	Tino	2000	35354.97	8.20	54.10	Compact	41.90	12.20
2	Prius (2nd Gen)	2000	26832.25	7.97	45.23	Compact	42.33	2.90
...	...	...	...	...	...	...	...	...
150	C-Max Energi Plug-in	2013	32950.00	11.76	43.00	Midsize	35.17	7.83
151	Fusion Energi Plug-in	2013	38700.00	11.76	43.00	Midsize	35.17	7.83
152	Chevrolet Volt	2013	39145.00	11.11	37.00	Compact	36.40	0.60

Lecture 26 – Residuals and Inference¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Residuals¶

Quality of fit¶

Example: Non-linear data¶

Residuals¶

Calculating residuals¶

Example: Predicting a son's height from his mother's height 👵👨 📏¶

Residual plots¶

The residual plot for a non-linear association 🚗¶

Issue: Patterns in the residual plot¶

Another example: `'mpg'` and `'acceleration'` ⛽¶

Issue: Uneven vertical spread¶

Example: Anscombe's quartet¶

Example: The Datasaurus Dozen 🦖¶

Inference for regression¶

Another perspective on regression¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Prediction intervals¶

Bootstrapping the scatter plot of mother/son heights¶

Bootstrapping predictions: mother/son heights¶

If a mother is 68 inches tall, how tall do we predict her son to be?¶

How different could our prediction have been, for all inputs?¶

Prediction interval width vs. mother's height¶

Summary, next time¶

Summary¶

Next time¶

Lecture 26 – Residuals and Inference¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Residuals¶

Quality of fit¶

Example: Non-linear data¶

Residuals¶

Calculating residuals¶

Example: Predicting a son's height from his mother's height 👵👨 📏¶

Residual plots¶

The residual plot for a non-linear association 🚗¶

Issue: Patterns in the residual plot¶

Another example: 'mpg' and 'acceleration' ⛽¶

Issue: Uneven vertical spread¶

Example: Anscombe's quartet¶

Example: The Datasaurus Dozen 🦖¶

Inference for regression¶

Another perspective on regression¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Prediction intervals¶

Bootstrapping the scatter plot of mother/son heights¶

Bootstrapping predictions: mother/son heights¶

If a mother is 68 inches tall, how tall do we predict her son to be?¶

How different could our prediction have been, for all inputs?¶

Prediction interval width vs. mother's height¶

Summary, next time¶

Summary¶

Next time¶

Another example: `'mpg'` and `'acceleration'` ⛽¶

Concept Check ✅ – Answer at cc.dsc10.com ¶