# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
from scipy import stats
# Render inline matplotlib figures as SVG and use the 'ggplot' style sheet.
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed output compact: summarize NumPy arrays longer than 20 elements,
# print 2 digits of precision, and suppress scientific notation for small values.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Limit pandas displays to 7 rows and 8 columns, with 2 digits of precision.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets

import warnings
warnings.filterwarnings('ignore')

# Demonstration code
def r_scatter(r):
    """Draw a scatter plot of 1000 random points with correlation roughly r.

    The horizontal values and an independent noise term are both drawn from
    a normal distribution with mean 0 and SD 1; the vertical values are a
    weighted mix of the two, with weights r and sqrt(1 - r ** 2). The points
    are drawn on the current axes, which are fixed to [-4, 4] in both
    directions.
    """
    n_points = 1000
    # Draw order matters for reproducibility under a fixed seed: x first, then noise.
    horizontal = np.random.normal(0, 1, n_points)
    noise = np.random.normal(0, 1, n_points)
    noise_weight = np.sqrt(1 - r ** 2)
    vertical = r * horizontal + noise_weight * noise
    plt.scatter(horizontal, vertical)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    
def show_scatter_grid():
    """Show two figures of example scatter plots, one panel per value of r.

    The first figure has four panels (r = -1, -2/3, -1/3, 0). The second has
    three panels (r = 1, 2/3, 1/3) and a fourth panel with its axes turned off.
    """
    n_panels = 4
    rows_of_r = ([-1, -2/3, -1/3, 0], [1, 2/3, 1/3])
    for r_values in rows_of_r:
        plt.subplots(1, n_panels, figsize=(10, 2))
        for position, r in enumerate(r_values, start=1):
            plt.subplot(1, n_panels, position)
            r_scatter(r)
            plt.title(f'r = {np.round(r, 2)}')
        # Hide any trailing panels that have no r value to display.
        for position in range(len(r_values) + 1, n_panels + 1):
            plt.subplot(1, n_panels, position)
            plt.axis('off')
        plt.show()


# Load the hybrid cars data from CSV and display it.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid


# Scatter plot of 'price' against 'acceleration'.
hybrid.plot(kind='scatter', x='acceleration', y='price', figsize=(10, 5));


# Scatter plot of 'price' against 'mpg'.
hybrid.plot(kind='scatter', x='mpg', y='price', figsize=(10, 5));


# Same relationship after multiplying each variable by a constant (a linear
# change of units): 'mpg' is rescaled into a 'km_per_liter' column and
# 'price' into a 'yen' column before plotting.
hybrid.assign(
    km_per_liter=hybrid.get('mpg') * 0.425144,
    yen=hybrid.get('price') * 140.34 
).plot(kind='scatter', x='km_per_liter', y='yen', figsize=(10, 5));


def standard_units(any_numbers):
    """Return any_numbers as a NumPy array expressed in standard units.

    Each value has the mean of the data subtracted from it and is then
    divided by the standard deviation as computed by np.std.
    """
    values = np.array(any_numbers)
    deviations = values - values.mean()
    spread = np.std(values)
    return deviations / spread


def standardize(df):
    """Return a new DataFrame holding every column of df in standard units.

    Each output column is named after its source column with ' (su)'
    appended, and columns appear in the same order as in df.
    """
    converted = bpd.DataFrame()
    for name in df.columns:
        su_label = name + ' (su)'
        su_values = standard_units(df.get(name))
        # The label is only known at runtime, so pass it via keyword unpacking.
        converted = converted.assign(**{su_label: su_values})
    return converted


# Standardize the three numeric columns of interest and re-attach the
# 'vehicle' column so that individual rows can still be identified.
hybrid_su = standardize(hybrid.get(['price', 'acceleration', 'mpg'])).assign(vehicle=hybrid.get('vehicle'))
hybrid_su


# 'acceleration' vs. 'price', both in standard units.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5));


# Rows more than 2 standard units above the mean in both acceleration and price.
hybrid_su[(hybrid_su.get('acceleration (su)') > 2) &
          (hybrid_su.get('price (su)') > 2)]


# 'mpg' vs. 'price', both in standard units.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5));


# Rows within 0.3 standard units of the mean in both mpg and price.
hybrid_su[(hybrid_su.get('mpg (su)') <= 0.3) &
          (hybrid_su.get('mpg (su)') >= -0.3) &
          (hybrid_su.get('price (su)') <= 0.3) &
          (hybrid_su.get('price (su)') >= -0.3)]


# Redraw the acceleration/price scatter with black lines at 0 on both axes,
# which splits the plot into four quadrants around the means.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');


# Same quadrant view for mpg/price.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');

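# Definition (illustrative only; x_su and y_su are not defined in this file):
#     r = (x_su * y_su).mean()
# where x_su and y_su are the x and y values converted to standard units.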


hybrid_su


# Correlation between acceleration and price: the mean of the products of
# the two columns in standard units.
r_acc_price = (hybrid_su.get('acceleration (su)') * hybrid_su.get('price (su)')).mean()
r_acc_price

# NOTE(review): the bare number below appears to be the captured notebook
# output of the expression above, not a computation.
0.6955778996913982


hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)', figsize=(10, 5))
plt.axvline(0, color='black');
plt.axhline(0, color='black');


hybrid_su


# Correlation between mpg and price, computed the same way.
r_mpg_price = (hybrid_su.get('mpg (su)') * hybrid_su.get('price (su)')).mean()
r_mpg_price

# NOTE(review): appears to be the captured notebook output of the expression above.
-0.5318263633683789


hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)', figsize=(10, 5));
plt.axvline(0, color='black');
plt.axhline(0, color='black');


# Show example scatter plots for a range of r values.
show_scatter_grid()


# Points lying exactly on the parabola y = x ** 2 for x from -6 to 6 in
# steps of 0.5 (a non-linear relationship between x and y).
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5), 
    y=np.arange(-6, 6.1, 0.5) ** 2
)
x2.plot(kind='scatter', x='x', y='y', figsize=(10, 5));


# Load the Galton heights data from CSV and display it.
galton = bpd.read_csv('data/galton.csv')
galton


# Keep only the rows whose 'gender' is 'male', then build a two-column
# DataFrame pairing each such child's height ('son') with the 'mother' value ('mom').
male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom = male_children.get('mother'), 
                                 son = male_children.get('childHeight'))
mom_son


mom_son.plot(kind='scatter', x='mom', y='son', figsize=(10, 5));


# Convert both columns to standard units and plot them.
mom_son_su = standardize(mom_son)
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5));


# Correlation between mothers' and sons' heights: the mean of the products
# of the two columns in standard units.
r_mom_son = (mom_son_su.get('mom (su)') * mom_son_su.get('son (su)')).mean()
r_mom_son

# NOTE(review): appears to be the captured notebook output of the expression above.
0.32300498368490554


def constant_prediction(prediction):
    """Plot the standardized mother/son data with one constant prediction.

    Draws the scatter plot of mom_son_su and overlays a thick orange
    horizontal line at height prediction (in standard units), with the
    x-axis limited to [-3, 3], then shows the figure.
    """
    plot_title = f'Predicting a height of {prediction} SUs for all sons'
    mom_son_su.plot(
        kind='scatter',
        x='mom (su)',
        y='son (su)',
        title=plot_title,
        figsize=(10, 5),
    )
    plt.axhline(prediction, color='orange', lw=4)
    plt.xlim(-3, 3)
    plt.show()

# Slider for the constant prediction, from -3 to 3 in steps of 0.5, starting at -3.
prediction = widgets.FloatSlider(value=-3, min=-3,max=3,step=0.5, description='prediction')
ui = widgets.HBox([prediction])
# Connect the slider's value to constant_prediction's 'prediction' argument,
# then display the slider together with the resulting plot output.
out = widgets.interactive_output(constant_prediction, {'prediction': prediction})
display(ui, out)

HBox(children=(FloatSlider(value=-3.0, description='prediction', max=3.0, min=-3.0, step=0.5),))

Output()


# Static version of the constant-prediction plot with the line fixed at
# 0 standard units.
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title='A good prediction is the mean height of sons (0 SUs)', figsize=(10, 5));
plt.axhline(0, color='orange', lw=4);
plt.xlim(-3, 3);


def linear_prediction(slope):
    """Plot the standardized mother/son data with a line through the origin.

    Draws the scatter plot of mom_son_su and overlays a thick orange line
    with the given slope passing through (0, 0), with the x-axis limited to
    [-3, 3]. The title shows the line's equation with slope rounded to two
    decimal places. The figure is then shown.
    """
    line_x = np.linspace(-3, 3)
    line_y = line_x * slope
    mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5))
    plt.plot(line_x, line_y, color='orange', lw=4)
    plt.xlim(-3, 3)
    equation_title = (
        r"Predicting sons' heights using $\mathrm{son}_{\mathrm{(su)}}$ = "
        + str(np.round(slope, 2))
        + r"$ \cdot \mathrm{mother}_{\mathrm{(su)}}$"
    )
    plt.title(equation_title)
    plt.show()

# Slider for the slope of the prediction line, from -1 to 1 in steps of 1/6, starting at 0.
slope = widgets.FloatSlider(value=0, min=-1,max=1,step=1/6, description='slope')
ui = widgets.HBox([slope])
# Connect the slider's value to linear_prediction's 'slope' argument, then
# display the slider together with the resulting plot output.
out = widgets.interactive_output(linear_prediction, {'slope': slope})
display(ui, out)

HBox(children=(FloatSlider(value=0.0, description='slope', max=1.0, min=-1.0, step=0.16666666666666666),))

Output()


# Regression line in standard units: passes through the origin with slope r_mom_son.
x = np.linspace(-3, 3)
y = x * r_mom_son
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=r'A good line goes through the origin and has slope $r$', figsize=(10, 5));
plt.plot(x, y, color='orange', label='regression line', lw=4)
plt.xlim(-3, 3)
plt.legend();


# Means and standard deviations (via np.std) of the original-unit columns;
# predict_with_r uses these to convert to and from standard units.
mom_mean = mom_son.get('mom').mean()
mom_sd = np.std(mom_son.get('mom'))
son_mean = mom_son.get('son').mean()
son_sd = np.std(mom_son.get('son'))


def predict_with_r(mom):
    """Predict a son's height from his mother's height using linear regression.

    The mother's height mom is converted to standard units, multiplied by
    the correlation r_mom_son to get the predicted son's height in standard
    units, and that prediction is converted back to the sons' original
    units, which is the value returned.
    """
    mom_in_su = (mom - mom_mean) / mom_sd
    predicted_son_in_su = r_mom_son * mom_in_su
    # Undo standardization using the sons' mean and SD.
    return son_mean + predicted_son_in_su * son_sd


# Predicted son's height for a mother with height 68.
predict_with_r(68)

# NOTE(review): appears to be the captured notebook output of the call above.
70.68219686848828


# Predicted son's height for a mother with height 60.
predict_with_r(60)

# NOTE(review): appears to be the captured notebook output of the call above.
67.76170758654767


# Add a column of predictions, one per row, by applying predict_with_r to
# every value in the 'mom' column.
preds = mom_son.assign(
    predicted_height=mom_son.get('mom').apply(predict_with_r)
)
# Draw the original data as a scatter plot, then overlay the predictions as
# a line on the same axes (ax).
ax = preds.plot(kind='scatter', x='mom', y='son', title='Regression line predictions, in original units', figsize=(10, 5), label='original data')
preds.plot(kind='line', x='mom', y='predicted_height', ax=ax, color='orange', label='regression line', lw=4);
plt.legend();

	vehicle	year	price	acceleration	mpg	class
0	Prius (1st Gen)	1997	24509.74	7.46	41.26	Compact
1	Tino	2000	35354.97	8.20	54.10	Compact
2	Prius (2nd Gen)	2000	26832.25	7.97	45.23	Compact
...	...	...	...	...	...	...
150	C-Max Energi Plug-in	2013	32950.00	11.76	43.00	Midsize
151	Fusion Energi Plug-in	2013	38700.00	11.76	43.00	Midsize
152	Chevrolet Volt	2013	39145.00	11.11	37.00	Compact

	price (su)	acceleration (su)	mpg (su)	vehicle
0	-6.94e-01	-1.54	0.59	Prius (1st Gen)
1	-1.86e-01	-1.28	1.76	Tino
2	-5.85e-01	-1.36	0.95	Prius (2nd Gen)
...	...	...	...	...
150	-2.98e-01	-0.07	0.75	C-Max Energi Plug-in
151	-2.90e-02	-0.07	0.75	Fusion Energi Plug-in
152	-8.17e-03	-0.29	0.20	Chevrolet Volt

	price (su)	acceleration (su)	mpg (su)	vehicle
47	2.71	2.05	-1.46	ActiveHybrid X6
60	3.04	2.88	-1.16	ActiveHybrid 7
95	2.96	2.12	-1.35	ActiveHybrid 7i
146	2.11	2.12	-0.90	ActiveHybrid 7L
147	2.66	2.24	-0.90	Panamera S

	price (su)	acceleration (su)	mpg (su)	vehicle
10	-1.24e-01	-0.56	-0.26	Escape
22	-2.13e-01	-1.02	-0.17	Mercury Mariner
57	-8.47e-02	0.72	-0.11	Audi Q5
...	...	...	...	...
70	-2.14e-01	-0.07	0.02	HS 250h
102	-2.69e-03	-0.29	0.20	Chevrolet Volt
152	-8.17e-03	-0.29	0.20	Chevrolet Volt

	price (su)	acceleration (su)	mpg (su)	vehicle
0	-6.94e-01	-1.54	0.59	Prius (1st Gen)
1	-1.86e-01	-1.28	1.76	Tino
2	-5.85e-01	-1.36	0.95	Prius (2nd Gen)
...	...	...	...	...
150	-2.98e-01	-0.07	0.75	C-Max Energi Plug-in
151	-2.90e-02	-0.07	0.75	Fusion Energi Plug-in
152	-8.17e-03	-0.29	0.20	Chevrolet Volt

Lecture 23 – Correlation¶

DSC 10, Winter 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

Four big ideas in statistical inference¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

`'acceleration'` and `'price'`¶

`'mpg'` and `'price'`¶

Linear changes in units¶

Converting columns to standard units¶

Standard units for hybrid cars¶

`'acceleration'` and `'price'`¶

`'mpg'` and `'price'`¶

Observation on associations in standard units¶

Correlation¶

Definition: Correlation coefficient¶

The correlation coefficient, $r$¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Regression¶

Example: Predicting heights 👪 📏¶

Mothers and sons 👵👨¶

Predicting a son's height based on his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Making predictions in original units¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Summary, next time¶

Summary¶

Next time¶

	family	father	mother	midparentHeight	children	childNum	gender	childHeight
0	1	78.5	67.0	75.43	4	1	male	73.2
1	1	78.5	67.0	75.43	4	2	female	69.2
2	1	78.5	67.0	75.43	4	3	female	69.0
...	...	...	...	...	...	...	...	...
931	203	62.0	66.0	66.64	3	3	female	61.0
932	204	62.5	63.0	65.27	2	1	male	66.5
933	204	62.5	63.0	65.27	2	2	female	57.0

Lecture 23 – Correlation¶

DSC 10, Winter 2023¶

Announcements¶

Agenda¶

Recap: Statistical inference¶

Four big ideas in statistical inference¶

Association¶

Prediction¶

Association¶

Example: Hybrid cars 🚗¶

'acceleration' and 'price'¶

'mpg' and 'price'¶

Linear changes in units¶

Converting columns to standard units¶

Standard units for hybrid cars¶

'acceleration' and 'price'¶

'mpg' and 'price'¶

Observation on associations in standard units¶

Correlation¶

Definition: Correlation coefficient¶

The correlation coefficient, $r$¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Regression¶

Example: Predicting heights 👪 📏¶

Mothers and sons 👵👨¶

Predicting a son's height based on his mother's height¶

Many possible ways to make predictions¶

Better predictions¶

The regression line¶

Making predictions in standard units¶

Making predictions in original units¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

Next time¶

`'acceleration'` and `'price'`¶

`'mpg'` and `'price'`¶

`'acceleration'` and `'price'`¶

`'mpg'` and `'price'`¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶