# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Use SVG as the inline figure format and the 'ggplot' style for all plots.
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Default figure size as (width, height).
plt.rcParams['figure.figsize'] = (10, 5)

# Keep printed output compact: long arrays are summarized, floats are shown
# with 2 digits of precision, and displayed tables are capped at 7 rows and
# 8 columns.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Silence all warnings from this point on.
import warnings
warnings.filterwarnings('ignore')

# Demonstration code
from IPython.display import display
import ipywidgets as widgets
import plotly.express as px

def r_scatter(r):
    """Draw a scatter plot of 1000 random points with correlation roughly r.

    x and an independent noise term are each drawn from a normal distribution
    with mean 0 and SD 1; y combines them with weights r and sqrt(1 - r^2).
    The points are drawn with plt.scatter, both axes are limited to [-4, 4],
    and the title shows the value of r.
    """
    n_points = 1000
    x = np.random.normal(0, 1, n_points)
    noise = np.random.normal(0, 1, n_points)
    # Weight on the noise term; together with r this sets how tightly the
    # points cluster around a line.
    noise_weight = np.sqrt(1 - r ** 2)
    y = r * x + noise_weight * noise
    plt.scatter(x, y)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.title(f'$r={r}$')
    
def show_scatter_grid():
    """Show two rows of example scatter plots, one figure per row.

    The first figure has four panels for r = -1, -2/3, -1/3 and 0. The second
    has three panels for r = 1, 2/3 and 1/3, with the unused fourth panel
    hidden. Each panel is drawn by r_scatter and then retitled with r rounded
    to 2 decimal places.
    """
    n_panels = 4
    rows = [
        [-1, -2/3, -1/3, 0],
        [1, 2/3, 1/3],
    ]
    for r_values in rows:
        plt.subplots(1, n_panels, figsize=(10, 2))
        for i, r in enumerate(r_values):
            plt.subplot(1, n_panels, i + 1)
            r_scatter(r)
            # Replace r_scatter's title with a rounded value, typeset as math
            # in every panel so both rows look the same.
            plt.title(f'$r = {np.round(r, 2)}$')
        # Hide any panels this row does not use.
        for i in range(len(r_values), n_panels):
            plt.subplot(1, n_panels, i + 1)
            plt.axis('off')
        plt.show()


# Read the hybrid cars data from a CSV file; the bare name on the next line
# displays the table when run as a notebook cell.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid


# Scatter plot of 'price' against 'acceleration'.
hybrid.plot(kind='scatter', x='acceleration', y='price');


# Scatter plot of 'price' against 'mpg'.
hybrid.plot(kind='scatter', x='mpg', y='price');


# Plotly version of the 'price' vs. 'mpg' plot, using each row's 'vehicle'
# value as the hover label.
px.scatter(hybrid.to_df(), x='mpg', y='price', hover_name='vehicle')


# Example scatter plots for several values of r.
show_scatter_grid()


# Slider for r from -1 to 1 in steps of 0.05; r_scatter is called with the
# selected value.
widgets.interact(r_scatter, r=(-1, 1, 0.05));

interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…


def standard_units(col):
    """Convert col to standard units.

    Each value has the mean of col subtracted from it and is then divided by
    the standard deviation of col, as computed by np.std.
    """
    center = col.mean()
    spread = np.std(col)
    return (col - center) / spread


def calculate_r(df, x, y):
    '''Returns the correlation coefficient r between columns x and y of df.

       Both columns are converted to standard units with standard_units;
       r is the mean of the element-wise product of the two results.'''
    in_standard_units = [standard_units(df.get(label)) for label in (x, y)]
    products = in_standard_units[0] * in_standard_units[1]
    return products.mean()


# 'price' vs. 'acceleration', followed by the correlation of the two columns.
hybrid.plot(kind='scatter', x='acceleration', y='price');


calculate_r(hybrid, 'acceleration', 'price')

# The number below appears to be the recorded output of the call above.
0.695577899691398


# 'price' vs. 'mpg', followed by the correlation of the two columns.
hybrid.plot(kind='scatter', x='mpg', y='price');


calculate_r(hybrid, 'mpg', 'price')

# The number below appears to be the recorded output of the call above.
-0.5318263633683789


# The same 'price' vs. 'mpg' plot, titled with the units of price.
hybrid.plot(kind='scatter', x='mpg', y='price', title='price (dollars) vs. mpg');


# Re-plot after multiplying each column by a constant (dollars -> yen,
# miles -> kilometers) to compare against the plot above.
hybrid.assign(
    price_yen=hybrid.get('price') * 147.20, # The USD to Japanese yen exchange rate as of this morning.
    kpg=hybrid.get('mpg') * 1.6             # 1 mile is about 1.6 kilometers.
).plot(kind='scatter', x='kpg', y='price_yen', title='price (yen) vs. kpg');


def standardize(df):
    """Return a new DataFrame holding every column of df in standard units.

    Each output column is named after the original column with ' (su)'
    appended, and is computed by standard_units.
    """
    result = bpd.DataFrame()
    for name in df.columns:
        su_values = standard_units(df.get(name))
        # The column name contains spaces and parentheses, so it is passed to
        # assign through an unpacked dictionary rather than as a keyword.
        new_column = {name + ' (su)': su_values}
        result = result.assign(**new_column)
    return result

# Standard-unit versions of the three numeric columns; the bare name on the
# next line displays the table when run as a notebook cell.
hybrid_su = standardize(hybrid.get(['acceleration', 'mpg', 'price']))
hybrid_su


# 'price' vs. 'acceleration' in standard units, with black lines drawn at
# x = 0 and y = 0.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)')
plt.axvline(0, color='black')
plt.axhline(0, color='black');


calculate_r(hybrid, 'acceleration', 'price')

# The number below appears to be the recorded output of the call above.
0.695577899691398


# 'price' vs. 'mpg' in standard units, with black lines drawn at x = 0 and
# y = 0.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');


calculate_r(hybrid, 'mpg', 'price')

# The number below appears to be the recorded output of the call above.
-0.5318263633683789


# Once again, run this cell and play with the slider that appears!
widgets.interact(r_scatter, r=(-1, 1, 0.05));

interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…


# A dataset where y is exactly x squared, for x from -6 up to (but not
# including) 6.1 in steps of 0.5.
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5), 
    y=np.arange(-6, 6.1, 0.5) ** 2
)
x2.plot(kind='scatter', x='x', y='y');


# Read the Galton data from a CSV file; the bare name on the next line
# displays the table when run as a notebook cell.
galton = bpd.read_csv('data/galton.csv')
galton


# Keep only the rows whose 'gender' is 'male', then build a two-column table
# pairing each of those rows' 'mother' value with its 'childHeight' value.
male_children = galton[galton.get('gender') == 'male'].reset_index()
mom_son = bpd.DataFrame().assign(mom=male_children.get('mother'), son=male_children.get('childHeight'))
mom_son


# Scatter plot of 'son' against 'mom'.
mom_son.plot(kind='scatter', x='mom', y='son');


# Correlation between the 'mom' and 'son' columns; reused later as the slope
# of the regression line in standard units.
r_mom_son = calculate_r(mom_son, 'mom', 'son')
r_mom_son

0.32300498368490554


# mom_son with both columns in standard units, named 'mom (su)' and 'son (su)'.
mom_son_su = standardize(mom_son)

def constant_prediction(prediction):
    """Plot the standardized heights with one flat prediction line.

    Draws the 'mom (su)' vs. 'son (su)' scatter plot from mom_son_su, adds a
    thick orange horizontal line at height `prediction`, limits the x-axis to
    [-3, 3], and shows the figure.
    """
    heading = f'Predicting a height of {prediction} SUs for all sons'
    mom_son_su.plot(
        kind='scatter',
        x='mom (su)',
        y='son (su)',
        title=heading,
        figsize=(10, 5),
    )
    plt.axhline(prediction, color='orange', lw=4)
    plt.xlim(-3, 3)
    plt.show()

# Slider for the constant prediction, from -3 to 3 in steps of 0.5, starting
# at -3. Its value is passed to constant_prediction as `prediction`, and the
# slider and the resulting output are displayed together.
prediction = widgets.FloatSlider(value=-3, min=-3,max=3,step=0.5, description='prediction')
ui = widgets.HBox([prediction])
out = widgets.interactive_output(constant_prediction, {'prediction': prediction})
display(ui, out)

HBox(children=(FloatSlider(value=-3.0, description='prediction', max=3.0, min=-3.0, step=0.5),))

Output()


# Standardized scatter plot with a horizontal orange line at 0 SUs, which the
# title identifies as the mean height of sons.
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title='A good prediction is the mean height of sons (0 SUs)', figsize=(10, 5));
plt.axhline(0, color='orange', lw=4);
plt.xlim(-3, 3);


def linear_prediction(slope):
    """Plot the standardized heights with a line through the origin.

    Draws the 'mom (su)' vs. 'son (su)' scatter plot from mom_son_su, overlays
    a thick orange line of the given slope passing through (0, 0) for x
    between -3 and 3, and shows the figure. The title states the prediction
    rule with the slope rounded to 2 decimal places.
    """
    mom_su = np.linspace(-3, 3)
    predicted_su = mom_su * slope
    heading = (
        r"Predicting sons' heights using $\mathrm{son}_{\mathrm{(su)}}$ = "
        + str(np.round(slope, 2))
        + r"$ \cdot \mathrm{mother}_{\mathrm{(su)}}$"
    )
    mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5))
    plt.plot(mom_su, predicted_su, color='orange', lw=4)
    plt.xlim(-3, 3)
    plt.title(heading)
    plt.show()

# Slider for the slope, from -1 to 1 in steps of 1/6, starting at 0. Its
# value is passed to linear_prediction as `slope`, and the slider and the
# resulting output are displayed together.
slope = widgets.FloatSlider(value=0, min=-1,max=1,step=1/6, description='slope')
ui = widgets.HBox([slope])
out = widgets.interactive_output(linear_prediction, {'slope': slope})
display(ui, out)

HBox(children=(FloatSlider(value=0.0, description='slope', max=1.0, min=-1.0, step=0.16666666666666666),))

Output()


# Regression line in standard units: it passes through the origin and its
# slope is the correlation r_mom_son.
x = np.linspace(-3, 3)
y = x * r_mom_son
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=r'A good line goes through the origin and has slope $r$', figsize=(10, 5));
plt.plot(x, y, color='orange', label='regression line', lw=4)
plt.xlim(-3, 3)
plt.legend();


# Mean and SD (via np.std) of each column of mom_son, in the original units.
# predict_with_r uses these to convert to and from standard units.
mom_mean = mom_son.get('mom').mean()
mom_sd = np.std(mom_son.get('mom'))
son_mean = mom_son.get('son').mean()
son_sd = np.std(mom_son.get('son'))


def predict_with_r(mom):
    """Predict a son's height from his mother's height `mom` using the
    regression line.

    The mother's height is converted to standard units with mom_mean and
    mom_sd, multiplied by r_mom_son to get the predicted son's height in
    standard units, and converted back to original units with son_sd and
    son_mean.
    """
    mom_in_su = (mom - mom_mean) / mom_sd
    predicted_son_su = r_mom_son * mom_in_su
    return predicted_son_su * son_sd + son_mean


# Predicted son's height for a mother's height of 68.
predict_with_r(68)

# The number below appears to be the recorded output of the call above.
70.68219686848828


# Predicted son's height for a mother's height of 60.
predict_with_r(60)

# The number below appears to be the recorded output of the call above.
67.76170758654767


# Add a column holding the regression prediction for every mother's height,
# then draw the original data as a scatter plot and the predictions as an
# orange line on the same axes.
preds = mom_son.assign(
    predicted_height=mom_son.get('mom').apply(predict_with_r)
)
ax = preds.plot(kind='scatter', x='mom', y='son', title='Regression line predictions, in original units', figsize=(10, 5), label='original data')
preds.plot(kind='line', x='mom', y='predicted_height', ax=ax, color='orange', label='regression line', lw=4);
plt.legend();

	vehicle	year	price	acceleration	mpg	class
0	Prius (1st Gen)	1997	24509.74	7.46	41.26	Compact
1	Tino	2000	35354.97	8.20	54.10	Compact
2	Prius (2nd Gen)	2000	26832.25	7.97	45.23	Compact
...	...	...	...	...	...	...
150	C-Max Energi Plug-in	2013	32950.00	11.76	43.00	Midsize
151	Fusion Energi Plug-in	2013	38700.00	11.76	43.00	Midsize
152	Chevrolet Volt	2013	39145.00	11.11	37.00	Compact

	acceleration (su)	mpg (su)	price (su)
0	-1.54	0.59	-6.94e-01
1	-1.28	1.76	-1.86e-01
2	-1.36	0.95	-5.85e-01
...	...	...	...
150	-0.07	0.75	-2.98e-01
151	-0.07	0.75	-2.90e-02
152	-0.29	0.20	-8.17e-03

	family	father	mother	midparentHeight	children	childNum	gender	childHeight
0	1	78.5	67.0	75.43	4	1	male	73.2
1	1	78.5	67.0	75.43	4	2	female	69.2
2	1	78.5	67.0	75.43	4	3	female	69.0
...	...	...	...	...	...	...	...	...
931	203	62.0	66.0	66.64	3	3	female	61.0
932	204	62.5	63.0	65.27	2	1	male	66.5
933	204	62.5	63.0	65.27	2	2	female	57.0

	mom	son
0	67.0	73.2
1	66.5	73.5
2	66.5	72.5
...	...	...
478	60.0	66.0
479	66.0	64.0
480	63.0	66.5

Lecture 24 – Correlation¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

What we've learned about inference¶

Moving forward¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Exploring the data¶

Measuring association¶

Correlation¶

Definition: Correlation coefficient¶

Example values of $r$¶

Calculating $r$¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Linear transformations¶

Why does the formula for $r$ involve standard units?¶

Scatter plots in standard units¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Interpreting $r$¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Regression¶

Example: Predicting heights 👪 📏¶

Predicting an adult son's height given his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Making predictions in original units¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

Next time¶

Lecture 24 – Correlation¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

What we've learned about inference¶

Moving forward¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Exploring the data¶

Measuring association¶

Correlation¶

Definition: Correlation coefficient¶

Example values of $r$¶

Calculating $r$¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Linear transformations¶

Why does the formula for $r$ involve standard units?¶

Scatter plots in standard units¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Interpreting $r$¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Regression¶

Example: Predicting heights 👪 📏¶

Predicting an adult son's height given his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Making predictions in original units¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

Next time¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Concept Check ✅ – Answer at cc.dsc10.com¶