# Run this cell to set up packages for lecture.
from lec24_imports import *

# Load the hybrid car data from data/hybrid.csv and display it.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid

# Scatter plot of 'price' (y-axis) against 'acceleration' (x-axis).
hybrid.plot(kind='scatter', x='acceleration', y='price');

# Scatter plot of 'price' (y-axis) against 'mpg' (x-axis).
hybrid.plot(kind='scatter', x='mpg', y='price');

# Same 'price' vs. 'mpg' plot drawn with px (presumably plotly.express, brought
# in by lec24_imports -- confirm), with the 'vehicle' column passed as hover_name.
px.scatter(hybrid.to_df(), x='mpg', y='price', hover_name='vehicle')

# NOTE(review): show_scatter_grid is not defined in this file; presumably a
# lec24_imports helper that draws several example scatter plots -- confirm.
show_scatter_grid()

# Slider for r from -1 to 1 in steps of 0.05, passed to r_scatter.
# NOTE(review): r_scatter is not defined in this file; presumably it draws a
# scatter plot whose correlation is r -- confirm.
widgets.interact(r_scatter, r=(-1, 1, 0.05));

interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…

def standard_units(col):
    """Convert a column of numbers to standard units.

    Each value is re-expressed as its deviation from the column's mean,
    divided by the column's standard deviation as computed by np.std.
    """
    deviations_from_mean = col - col.mean()
    spread = np.std(col)
    return deviations_from_mean / spread

def calculate_r(df, x, y):
    """Compute the correlation coefficient r between two columns of df.

    x and y are column labels. Both columns are converted to standard units,
    multiplied together element by element, and the mean of those products
    is returned.
    """
    products_in_su = standard_units(df.get(x)) * standard_units(df.get(y))
    return products_in_su.mean()

# Redraw 'price' vs. 'acceleration', then compute r for the same two columns.
hybrid.plot(kind='scatter', x='acceleration', y='price');

calculate_r(hybrid, 'acceleration', 'price')

0.6955778996913978

# Redraw 'price' vs. 'mpg', then compute r for the same two columns.
hybrid.plot(kind='scatter', x='mpg', y='price');

calculate_r(hybrid, 'mpg', 'price')

-0.5318263633683786

# 'price' vs. 'mpg' in the original units.
hybrid.plot(kind='scatter', x='mpg', y='price', title='price (dollars) vs. mpg');

# The same data after multiplying each variable by a constant (a change of
# units), plotted for comparison with the scatter plot above.
hybrid.assign(
    price_yen=hybrid.get('price') * 149.99, # USD to Japanese Yen exchange rate when this was written.
    kpg=hybrid.get('mpg') * 1.6             # 1 mile is approximately 1.6 kilometers.
).plot(kind='scatter', x='kpg', y='price_yen', title='price (yen) vs. kpg');

def standardize(df):
    """Build a new DataFrame holding every column of df in standard units.

    Each output column is named after its source column with ' (su)' appended.
    """
    # The new column names contain spaces and parentheses, so they can't be
    # written as ordinary keyword arguments; they are collected in a dictionary
    # and unpacked into assign instead. (This syntax is out of scope; don't
    # worry about how it works.)
    su_columns = {
        name + ' (su)': standard_units(df.get(name))
        for name in df.columns
    }
    return bpd.DataFrame().assign(**su_columns)

# Convert the three numeric columns of interest to standard units and display
# the result.
hybrid_su = standardize(hybrid.get(['acceleration', 'mpg', 'price']))
hybrid_su

# 'price' vs. 'acceleration' in standard units, with black lines drawn at
# x = 0 and y = 0 to split the plot into four quadrants.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)')
plt.axvline(0, color='black')
plt.axhline(0, color='black');

# r for the same pair of variables, computed from the original-unit columns.
calculate_r(hybrid, 'acceleration', 'price')

0.6955778996913978

# 'price' vs. 'mpg' in standard units, with black lines drawn at x = 0 and
# y = 0 to split the plot into four quadrants.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

# r for the same pair of variables, computed from the original-unit columns.
calculate_r(hybrid, 'mpg', 'price')

-0.5318263633683786

# Once again, run this cell and play with the slider that appears!
# The slider sets r between -1 and 1 in steps of 0.05 and passes it to r_scatter.
# NOTE(review): r_scatter is not defined in this file; presumably it comes
# from lec24_imports -- confirm.
widgets.interact(r_scatter, r=(-1, 1, 0.05));

interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…

# A perfect but non-linear relationship: y is exactly x squared, for x from
# -6 up to (but not including) 6.1 in steps of 0.5.
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5), 
    y=np.arange(-6, 6.1, 0.5) ** 2
)
x2.plot(kind='scatter', x='x', y='y');

# Load the height data from data/galton.csv and display it.
galton = bpd.read_csv('data/galton.csv')
galton

# Keep only the rows whose 'gender' is 'male', then pair each mother's height
# ('mother') with her son's height ('childHeight').
# NOTE(review): reset_index() is presumably there so the kept rows are
# renumbered from 0 -- confirm.
male_children = galton[galton.get('gender') == 'male'].reset_index()
mom_son = bpd.DataFrame().assign(mom=male_children.get('mother'), son=male_children.get('childHeight'))
mom_son

mom_son.plot(kind='scatter', x='mom', y='son');

# Correlation between mothers' heights and sons' heights; saved because it is
# used later as the slope of the regression line in standard units.
r_mom_son = calculate_r(mom_son, 'mom', 'son')
r_mom_son

0.3230049836849053

# Both columns of mom_son in standard units ('mom (su)' and 'son (su)');
# the plotting code below reads this global.
mom_son_su = standardize(mom_son)

def constant_prediction(prediction):
    """Plot the mom/son data in standard units with one constant prediction.

    Draws the scatter plot of the global mom_son_su and overlays a thick
    orange horizontal line at height `prediction`, i.e. the same predicted
    height (in standard units) for every son.
    """
    plot_title = f'Predicting a height of {prediction} SUs for all sons'
    mom_son_su.plot(
        kind='scatter',
        x='mom (su)',
        y='son (su)',
        title=plot_title,
        figsize=(10, 5),
    )
    plt.axhline(prediction, color='orange', lw=4)
    plt.xlim(-3, 3)
    plt.show()

# Slider for the constant prediction: from -3 to 3 in steps of 0.5, starting
# at -3.
prediction = widgets.FloatSlider(value=-3, min=-3,max=3,step=0.5, description='prediction')
ui = widgets.HBox([prediction])
# Connect the slider to constant_prediction through its 'prediction' argument.
# NOTE(review): presumably the plot is redrawn whenever the slider moves --
# confirm against the ipywidgets documentation.
out = widgets.interactive_output(constant_prediction, {'prediction': prediction})
display(ui, out)

HBox(children=(FloatSlider(value=-3.0, description='prediction', max=3.0, min=-3.0, step=0.5),))

Output()

# The constant prediction at 0 standard units, drawn as an orange horizontal
# line over the standardized mom/son scatter plot.
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title='A good prediction is the mean height of sons (0 SUs)', figsize=(10, 5));
plt.axhline(0, color='orange', lw=4);
plt.xlim(-3, 3);

def linear_prediction(slope):
    """Plot the mom/son data in standard units with a line through the origin.

    Draws the scatter plot of the global mom_son_su and overlays the orange
    line y = slope * x for x between -3 and 3. The title shows the slope
    rounded to two decimal places.
    """
    mom_su_grid = np.linspace(-3, 3)
    predicted_son_su = mom_su_grid * slope
    mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5))
    plt.plot(mom_su_grid, predicted_son_su, color='orange', lw=4)
    plt.xlim(-3, 3)
    rounded_slope = str(np.round(slope, 2))
    plt.title(
        r"Predicting sons' heights using $\mathrm{son}_{\mathrm{(su)}}$ = "
        + rounded_slope
        + r"$ \cdot \mathrm{mother}_{\mathrm{(su)}}$"
    )
    plt.show()

# Slider for the slope of the prediction line: from -1 to 1 in steps of 1/6,
# starting at 0.
slope = widgets.FloatSlider(value=0, min=-1,max=1,step=1/6, description='slope')
ui = widgets.HBox([slope])
# Connect the slider to linear_prediction through its 'slope' argument.
# NOTE(review): presumably the plot is redrawn whenever the slider moves --
# confirm against the ipywidgets documentation.
out = widgets.interactive_output(linear_prediction, {'slope': slope})
display(ui, out)

HBox(children=(FloatSlider(value=0.0, description='slope', max=1.0, min=-1.0, step=0.16666666666666666),))

Output()

# The line through the origin whose slope is r_mom_son, evaluated for x
# between -3 and 3 and drawn over the standardized mom/son scatter plot.
x = np.linspace(-3, 3)
y = x * r_mom_son
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=r'A good line goes through the origin and has slope $r$', figsize=(10, 5));
plt.plot(x, y, color='orange', label='regression line', lw=4)
plt.xlim(-3, 3)
plt.legend();

	vehicle	year	price	acceleration	mpg	class
0	Prius (1st Gen)	1997	24509.74	7.46	41.26	Compact
1	Tino	2000	35354.97	8.20	54.10	Compact
2	Prius (2nd Gen)	2000	26832.25	7.97	45.23	Compact
...	...	...	...	...	...	...
150	C-Max Energi Plug-in	2013	32950.00	11.76	43.00	Midsize
151	Fusion Energi Plug-in	2013	38700.00	11.76	43.00	Midsize
152	Chevrolet Volt	2013	39145.00	11.11	37.00	Compact

	acceleration (su)	mpg (su)	price (su)
0	-1.54	0.59	-6.94e-01
1	-1.28	1.76	-1.86e-01
2	-1.36	0.95	-5.85e-01
...	...	...	...
150	-0.07	0.75	-2.98e-01
151	-0.07	0.75	-2.90e-02
152	-0.29	0.20	-8.17e-03

	family	father	mother	midparentHeight	children	childNum	gender	childHeight
0	1	78.5	67.0	75.43	4	1	male	73.2
1	1	78.5	67.0	75.43	4	2	female	69.2
2	1	78.5	67.0	75.43	4	3	female	69.0
...	...	...	...	...	...	...	...	...
931	203	62.0	66.0	66.64	3	3	female	61.0
932	204	62.5	63.0	65.27	2	1	male	66.5
933	204	62.5	63.0	65.27	2	2	female	57.0

	mom	son
0	67.0	73.2
1	66.5	73.5
2	66.5	72.5
...	...	...
478	60.0	66.0
479	66.0	64.0
480	63.0	66.5

Lecture 24 – Correlation¶

DSC 10, Spring 2024¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

What we've learned about inference¶

Moving forward¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Exploring the data¶

Measuring association¶

Correlation¶

Definition: Correlation coefficient¶

Example values of $r$¶

Calculating $r$¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Linear transformations¶

Why does the formula for $r$ involve standard units?¶

Scatter plots in standard units¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Interpreting $r$¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Regression¶

Example: Predicting heights 👪 📏¶

Predicting an adult son's height given his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Summary, next time¶

Summary¶

Next time¶

Lecture 24 – Correlation¶

DSC 10, Spring 2024¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

What we've learned about inference¶

Moving forward¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Exploring the data¶

Measuring association¶

Correlation¶

Definition: Correlation coefficient¶

Example values of $r$¶

Calculating $r$¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Linear transformations¶

Why does the formula for $r$ involve standard units?¶

Scatter plots in standard units¶

'price' vs. 'acceleration'¶

'price' vs. 'mpg'¶

Interpreting $r$¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Regression¶

Example: Predicting heights 👪 📏¶

Predicting an adult son's height given his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Summary, next time¶

Summary¶

Next time¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

`'price'` vs. `'acceleration'`¶

`'price'` vs. `'mpg'`¶

Concept Check ✅ – Answer at cc.dsc10.com ¶