In [1]:
# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

plt.rcParams['figure.figsize'] = (10, 5)

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display
import ipywidgets as widgets

import warnings
warnings.filterwarnings('ignore')

# Demonstration code
def r_scatter(r):
    "Generate a scatter plot with a correlation approximately r"
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r * x + (np.sqrt(1 - r ** 2)) * z
    plt.scatter(x, y)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    
def show_scatter_grid():
    plt.subplots(1, 4, figsize=(10, 2))
    for i, r in enumerate([-1, -2/3, -1/3, 0]):
        plt.subplot(1, 4, i+1)
        r_scatter(r)
        plt.title(f'r = {np.round(r, 2)}')
    plt.show()
    plt.subplots(1, 4, figsize=(10, 2))
    for i, r in enumerate([1, 2/3, 1/3]):
        plt.subplot(1, 4, i+1)
        r_scatter(r)
        plt.title(f'r = {np.round(r, 2)}')
    plt.subplot(1, 4, 4)
    plt.axis('off')
    plt.show()

Lecture 24 – Correlation¶

DSC 10, Spring 2023¶

Announcements¶

  • Lab 7 is due on Saturday 6/3 at 11:59PM.
  • The Final Project is due on Tuesday 6/6 at 11:59PM.
    • Issues saving your Final Project notebook? Watch this video!

Agenda¶

  • Association.
  • Correlation.
  • Regression.

Remember to review the end of Lecture 23 for a high-level summary of the second half of the class so far.

Association¶

Prediction¶

  • Suppose we have a dataset with at least two numerical variables.
  • We're interested in predicting one variable based on another:
    • Given my education level, what is my income?
    • Given my height, how tall will my kid be as an adult?
    • Given my age, how many countries have I visited?
  • To do this effectively, we need to first observe a pattern between the two numerical variables.
  • To see if a pattern exists, we'll need to draw a scatter plot.

Association¶

  • An association is any relationship or link 🔗 between two variables in a scatter plot. Associations can be linear or non-linear.
  • If two variables have a positive association ↗️, then as one variable increases, the other tends to increase.
  • If two variables have a negative association ↘️, then as one variable increases, the other tends to decrease.
  • If two variables are associated, then we can predict the value of one variable based on the value of the other.

Example: Hybrid cars 🚗¶

In [2]:
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid
Out[2]:
vehicle year price acceleration mpg class
0 Prius (1st Gen) 1997 24509.74 7.46 41.26 Compact
1 Tino 2000 35354.97 8.20 54.10 Compact
2 Prius (2nd Gen) 2000 26832.25 7.97 45.23 Compact
... ... ... ... ... ... ...
150 C-Max Energi Plug-in 2013 32950.00 11.76 43.00 Midsize
151 Fusion Energi Plug-in 2013 38700.00 11.76 43.00 Midsize
152 Chevrolet Volt 2013 39145.00 11.11 37.00 Compact

153 rows × 6 columns

'price' vs. 'acceleration'¶

Is there an association between these two variables? If so, what kind?

In [3]:
hybrid.plot(kind='scatter', x='acceleration', y='price');

'price' vs. 'mpg'¶

Is there an association between these two variables? If so, what kind?

In [4]:
hybrid.plot(kind='scatter', x='mpg', y='price');

Observations:

  • There is a negative association – cars with better fuel economy tended to be cheaper.
    • Why do we think that is? 🤔
  • The association looks more curved than linear.
    • It may roughly follow $y \approx \frac{1}{x}$.

Linear changes in units¶

  • A linear change in units doesn't change the shape of the plot; it only changes its scale.
    • A linear change means adding or subtracting a constant, and/or multiplying or dividing by a constant.
  • In other words, instead of plotting price in dollars and fuel economy in miles per gallon, we can plot price in Yen (🇯🇵) and fuel economy in kilometers per liter and the plot would look the same, just with different axes:
In [5]:
hybrid.assign(
    km_per_liter=hybrid.get('mpg') * 0.425144,
    yen=hybrid.get('price') * 139.77 
).plot(kind='scatter', x='km_per_liter', y='yen');

Converting columns to standard units¶

  • Recall: Suppose $x$ is a numerical variable, and $x_i$ is one value of that variable. To convert $x_i$ to standard units, $$x_{i \: \text{(su)}} = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
  • Converting columns to standard units makes different scatter plots comparable, by putting the $x$ and $y$ axes on the same scale.
    • Both axes measure the number of standard deviations above the mean.
  • Converting columns to standard units doesn't change the shape of the scatter plot, because the conversion is linear.
In [6]:
def standard_units(any_numbers):
    "Convert any array of numbers to standard units."
    any_numbers = np.array(any_numbers)
    return (any_numbers - any_numbers.mean()) / np.std(any_numbers)
In [7]:
def standardize(df):
    """Return a DataFrame in which all columns of df are converted to standard units."""
    df_su = bpd.DataFrame()
    for column in df.columns:
        df_su = df_su.assign(**{column + ' (su)': standard_units(df.get(column))})
    return df_su

Standard units for hybrid cars¶

For a given pair of variables:

  • Which cars are average in both variables?
  • Which cars are well above or well below average in both variables?
In [8]:
hybrid_su = standardize(hybrid.get(['price', 'acceleration', 'mpg'])).assign(vehicle=hybrid.get('vehicle'))
hybrid_su
Out[8]:
price (su) acceleration (su) mpg (su) vehicle
0 -6.94e-01 -1.54 0.59 Prius (1st Gen)
1 -1.86e-01 -1.28 1.76 Tino
2 -5.85e-01 -1.36 0.95 Prius (2nd Gen)
... ... ... ... ...
150 -2.98e-01 -0.07 0.75 C-Max Energi Plug-in
151 -2.90e-02 -0.07 0.75 Fusion Energi Plug-in
152 -8.17e-03 -0.29 0.20 Chevrolet Volt

153 rows × 4 columns

'price' vs. 'acceleration'¶

In [9]:
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)');

Which cars have 'acceleration's and 'price's that are more than 2 SDs above average?

In [10]:
hybrid_su[(hybrid_su.get('acceleration (su)') > 2) &
          (hybrid_su.get('price (su)') > 2)]
Out[10]:
price (su) acceleration (su) mpg (su) vehicle
47 2.71 2.05 -1.46 ActiveHybrid X6
60 3.04 2.88 -1.16 ActiveHybrid 7
95 2.96 2.12 -1.35 ActiveHybrid 7i
146 2.11 2.12 -0.90 ActiveHybrid 7L
147 2.66 2.24 -0.90 Panamera S

'price' vs. 'mpg'¶

In [11]:
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)');

Which cars have close to average 'mpg's and close to average 'price's?

In [12]:
hybrid_su[(hybrid_su.get('mpg (su)') <= 0.3) &
          (hybrid_su.get('mpg (su)') >= -0.3) &
          (hybrid_su.get('price (su)') <= 0.3) &
          (hybrid_su.get('price (su)') >= -0.3)]
Out[12]:
price (su) acceleration (su) mpg (su) vehicle
10 -1.24e-01 -0.56 -0.26 Escape
22 -2.13e-01 -1.02 -0.17 Mercury Mariner
57 -8.47e-02 0.72 -0.11 Audi Q5
... ... ... ... ...
70 -2.14e-01 -0.07 0.02 HS 250h
102 -2.69e-03 -0.29 0.20 Chevrolet Volt
152 -8.17e-03 -0.29 0.20 Chevrolet Volt

8 rows × 4 columns

Observation on associations in standard units¶

  • If two variables are positively associated ↗️,
    • when one variable is positive, the other tends to be positive, and
    • when one variable is negative, the other also tends to be negative (see the sketch after this list).
  • If two variables are negatively associated ↘️,
    • when one variable is positive, the other tends to be negative, and vice versa.
  • If two variables aren't associated, there should be no such pattern.
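
As a quick illustration of the first point, here is a minimal sketch (not part of the lecture code) that counts how often 'acceleration' and 'price' have the same sign in the hybrid_su table from above. For a positively associated pair, this fraction should be well above one half.

acc_su = np.array(hybrid_su.get('acceleration (su)'))
price_su = np.array(hybrid_su.get('price (su)'))
# Fraction of cars in the lower-left or upper-right quadrant (matching signs).
(np.sign(acc_su) == np.sign(price_su)).mean()

This is consistent with the scatter plot of 'price (su)' vs. 'acceleration (su)' above, where most points fall in the lower-left and upper-right quadrants.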

Correlation¶

Definition: Correlation coefficient¶

The correlation coefficient $r$ of two variables $x$ and $y$ is defined as the

  • average value of the
  • product of $x$ and $y$
  • when both are measured in standard units.

If x and y are two Series or arrays,

r = (x_su * y_su).mean()

where x_su and y_su are x and y converted to standard units.

In [13]:
def calculate_r(df, x, y):
    x_su = df.get(x + ' (su)')
    y_su = df.get(y + ' (su)')
    return (x_su * y_su).mean()
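
To connect this back to the definition, here is a minimal sketch that applies it directly to two short made-up arrays (hypothetical data, not from the lecture), using the standard_units function from earlier: $r$ is just the mean of the elementwise product of the two variables in standard units.

x = np.array([1, 2, 3, 4, 5])   # hypothetical data, for illustration only
y = np.array([2, 3, 5, 4, 6])
# r is the average of the products of x and y, measured in standard units.
(standard_units(x) * standard_units(y)).mean()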

Let's calculate $r$ for 'acceleration' and 'price'.

In [14]:
hybrid_su
Out[14]:
price (su) acceleration (su) mpg (su) vehicle
0 -6.94e-01 -1.54 0.59 Prius (1st Gen)
1 -1.86e-01 -1.28 1.76 Tino
2 -5.85e-01 -1.36 0.95 Prius (2nd Gen)
... ... ... ... ...
150 -2.98e-01 -0.07 0.75 C-Max Energi Plug-in
151 -2.90e-02 -0.07 0.75 Fusion Energi Plug-in
152 -8.17e-03 -0.29 0.20 Chevrolet Volt

153 rows × 4 columns

In [15]:
r_acc_price = calculate_r(hybrid_su, 'acceleration', 'price')
r_acc_price
Out[15]:
0.6955778996913982
In [16]:
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

Note that the correlation is positive, and most data points fall in the lower left and upper right quadrants!

Let's now calculate $r$ for 'mpg' and 'price'.

In [17]:
hybrid_su
Out[17]:
price (su) acceleration (su) mpg (su) vehicle
0 -6.94e-01 -1.54 0.59 Prius (1st Gen)
1 -1.86e-01 -1.28 1.76 Tino
2 -5.85e-01 -1.36 0.95 Prius (2nd Gen)
... ... ... ... ...
150 -2.98e-01 -0.07 0.75 C-Max Energi Plug-in
151 -2.90e-02 -0.07 0.75 Fusion Energi Plug-in
152 -8.17e-03 -0.29 0.20 Chevrolet Volt

153 rows × 4 columns

In [18]:
r_mpg_price = calculate_r(hybrid_su, 'mpg', 'price')
r_mpg_price
Out[18]:
-0.5318263633683789
In [19]:
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)');
plt.axvline(0, color='black');
plt.axhline(0, color='black');

Note that the correlation is negative, and most data points fall in the upper left and lower right quadrants!

The correlation coefficient, $r$¶

  • $r$ measures how clustered points are around a straight line – it measures linear association.
    • If two variables are correlated, it means they are linearly associated.
  • $r$ is always between $-1$ and $1$.
    • If $r = 1$, the points lie on a perfectly straight line that slopes upwards (slope 1 in standard units).
    • If $r = -1$, the points lie on a perfectly straight line that slopes downwards (slope -1 in standard units).
    • If $r = 0$, there is no linear association (uncorrelated).
In [20]:
show_scatter_grid()
  • $r$ is computed based on standard units.
    • The correlation between price in dollars and fuel economy in miles per gallon is the same as the correlation between price in Yen and fuel economy in kilometers per liter (verified in the sketch after this list).
  • $r$ quantifies how well we can predict one variable using the other.
    • If $r$ is close to $1$ or $-1$ we can predict one variable from the other quite accurately.
    • If $r$ is close to $0$, we cannot make good predictions.
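
Here is a minimal sketch that verifies the unit-invariance claim above using NumPy's built-in np.corrcoef, with the same conversion factors used earlier in the lecture. (np.corrcoef returns a 2x2 matrix; the off-diagonal entry is $r$.)

mpg = np.array(hybrid.get('mpg'))
price = np.array(hybrid.get('price'))
r_dollars_mpg = np.corrcoef(mpg, price)[0, 1]
r_yen_kmpl = np.corrcoef(mpg * 0.425144, price * 139.77)[0, 1]
# The two values are equal, up to floating-point error.
r_dollars_mpg, r_yen_kmpl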

Concept Check ✅ – Answer at cc.dsc10.com¶

Which of the following does the scatter plot below show?

  • A. Association and correlation
  • B. Association but not correlation
  • C. Correlation but not association
  • D. Neither association nor correlation
In [21]:
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5), 
    y=np.arange(-6, 6.1, 0.5) ** 2
)
x2.plot(kind='scatter', x='x', y='y');
✅ Click here to see the answer after trying it yourself.

B. Association but not correlation.

Since there is a pattern in the scatter plot of $x$ and $y$, there is an association between $x$ and $y$. However, correlation refers to linear association, and there is no linear association between $x$ and $y$. The relationship between $x$ and $y$ is actually $y = x^2$. Even though the association between $x$ and $y$ is very strong, it cannot be described by a linear function, because as $x$ increases, $y$ first decreases and then increases. The correlation ($r$) between $x$ and $y$ is 0 – try to calculate it yourself!
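
Here is a minimal sketch that does that calculation, using the standardize and calculate_r helpers defined earlier:

# The correlation between x and y is (essentially) zero, even though
# the two variables are strongly (non-linearly) associated.
x2_su = standardize(x2)
calculate_r(x2_su, 'x', 'y')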

Regression¶

Example: Predicting heights 👪 📏¶

The data below was collected in the late 1800s by Francis Galton.

  • He was a eugenicist and proponent of scientific racism, which is why he collected this data.
  • Today, we understand that eugenics is immoral, and that there is no scientific evidence or any other justification for racism.
  • Galton is credited with discovering regression using this data.
In [22]:
galton = bpd.read_csv('data/galton.csv')
galton
Out[22]:
family father mother midparentHeight children childNum gender childHeight
0 1 78.5 67.0 75.43 4 1 male 73.2
1 1 78.5 67.0 75.43 4 2 female 69.2
2 1 78.5 67.0 75.43 4 3 female 69.0
... ... ... ... ... ... ... ... ...
931 203 62.0 66.0 66.64 3 3 female 61.0
932 204 62.5 63.0 65.27 2 1 male 66.5
933 204 62.5 63.0 65.27 2 2 female 57.0

934 rows × 8 columns

Mothers and sons 👵👨¶

Let's just consider the relationship between mothers' heights and their adult sons' heights.

In [23]:
male_children = galton[galton.get('gender') == 'male']
mom_son = bpd.DataFrame().assign(mom = male_children.get('mother'), 
                                 son = male_children.get('childHeight'))
mom_son
Out[23]:
mom son
0 67.0 73.2
4 66.5 73.5
5 66.5 72.5
... ... ...
925 60.0 66.0
929 66.0 64.0
932 63.0 66.5

481 rows × 2 columns

In [24]:
mom_son.plot(kind='scatter', x='mom', y='son');

Predicting a son's height based on his mother's height¶

  • The scatter plot demonstrates a positive association between a mother's height ('mom') and her son's height ('son').
  • Let's quantify how linear that association is by computing the correlation between 'mom' and 'son'.
  • First, we standardize the data.
In [25]:
mom_son_su = standardize(mom_son)
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)');
In [26]:
r_mom_son = calculate_r(mom_son_su, 'mom', 'son')
r_mom_son
Out[26]:
0.32300498368490554

Many possible ways to make predictions¶

  • We want a simple strategy, or rule, for predicting a son's height.
  • The simplest possible prediction strategy just predicts the same value for every son's height, regardless of his mother's height.
  • Some such predictions are better than others.
In [27]:
def constant_prediction(prediction):
    mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=f'Predicting a height of {prediction} SUs for all sons', figsize=(10, 5));
    plt.axhline(prediction, color='orange', lw=4);
    plt.xlim(-3, 3)
    plt.show()

prediction = widgets.FloatSlider(value=-3, min=-3,max=3,step=0.5, description='prediction')
ui = widgets.HBox([prediction])
out = widgets.interactive_output(constant_prediction, {'prediction': prediction})
display(ui, out)
  • Which of these predictions is the best?
    • It depends on what we mean by "best," but a natural choice is the rule that predicts 0 standard units, because this corresponds to the mean height of all sons.
In [28]:
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title='A good prediction is the mean height of sons (0 SUs)', figsize=(10, 5));
plt.axhline(0, color='orange', lw=4);
plt.xlim(-3, 3);

Better predictions¶

  • Since there is a linear association between a son's height and his mother's height, we can make better predictions by allowing our predictions to vary with the mother's height.
  • The simplest way to do this uses a line to make predictions.
  • As before, some lines are better than others.
In [29]:
def linear_prediction(slope):
    x = np.linspace(-3, 3)
    y = x * slope
    mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5));
    plt.plot(x, y, color='orange', lw=4)
    plt.xlim(-3, 3)
    plt.title(r"Predicting sons' heights using $\mathrm{son}_{\mathrm{(su)}}$ = " + str(np.round(slope, 2)) + r"$ \cdot \mathrm{mother}_{\mathrm{(su)}}$")
    plt.show()

slope = widgets.FloatSlider(value=0, min=-1,max=1,step=1/6, description='slope')
ui = widgets.HBox([slope])
out = widgets.interactive_output(linear_prediction, {'slope': slope})
display(ui, out)
  • Which of these lines is the best?
    • Again, it depends on what we mean by "best," but a good choice is the line that goes through the origin and has a slope of $r$.
    • This line is called the regression line, and we'll see next time that it is the "best" line for making predictions in a certain sense.
In [30]:
x = np.linspace(-3, 3)
y = x * r_mom_son
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=r'A good line goes through the origin and has slope $r$', figsize=(10, 5));
plt.plot(x, y, color='orange', label='regression line', lw=4)
plt.xlim(-3, 3)
plt.legend();

The regression line¶

  • The regression line is the line through $(0,0)$ with slope $r$, when both variables are measured in standard units.
  • We use the regression line to make predictions!

Making predictions in standard units¶

  • If $r = 0.32$, and the given $x$ is $2$ in standard units, then the prediction for $y$ is $0.64$ standard units.
    • The regression line predicts that a mother whose height is $2$ SDs above average has a son whose height is $0.64$ SDs above average.
  • If $r = 0.32$, and the given $x$ is $-1$ in standard units, then the prediction for $y$ is $-0.32$ standard units (see the sketch after this list).
  • We always predict that a son will be somewhat closer to average in height than his mother.
    • This is a consequence of the slope $r$ having magnitude less than 1.
    • This effect is called regression to the mean.
  • The regression line passes through the origin $(0, 0)$ in standard units. This means that, no matter what $r$ is, for an average $x$ value, we predict an average $y$ value.
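
In standard units, the arithmetic above is just multiplication by $r$; a minimal sketch using the r_mom_son value computed earlier (the bullets round it to $0.32$):

# Predicted y in standard units is r times the given x in standard units.
r_mom_son * 2    # mother 2 SDs above average: about 0.65 SDs (0.64 with r rounded to 0.32)
r_mom_son * -1   # mother 1 SD below average: about -0.32 SDs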

Making predictions in original units¶

Of course, we'd like to be able to predict a son's height in inches, not just in standard units. Given a mother's height in inches, here's how we'll predict her son's height in inches:

  1. Convert the mother's height from inches to standard units.
$$x_{i \: \text{(su)}} = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
  2. Multiply by the correlation coefficient to predict the son's height in standard units.
$$\text{predicted } y_{i \: \text{(su)}} = r \cdot x_{i \: \text{(su)}}$$
  3. Convert the son's predicted height from standard units back to inches.
$$\text{predicted } y_i = \text{predicted } y_{i \: \text{(su)}} \cdot \text{SD of $y$} + \text{mean of $y$}$$

Let's try it!

In [31]:
mom_mean = mom_son.get('mom').mean()
mom_sd = np.std(mom_son.get('mom'))
son_mean = mom_son.get('son').mean()
son_sd = np.std(mom_son.get('son'))
In [32]:
def predict_with_r(mom):
    """Return a prediction for the height of a son whose mother has height mom, 
    using linear regression.
    """
    mom_su = (mom - mom_mean) / mom_sd
    son_su = r_mom_son * mom_su
    return son_su * son_sd + son_mean
In [33]:
predict_with_r(68)
Out[33]:
70.68219686848828
In [34]:
predict_with_r(60)
Out[34]:
67.76170758654767
In [35]:
preds = mom_son.assign(
    predicted_height=mom_son.get('mom').apply(predict_with_r)
)
ax = preds.plot(kind='scatter', x='mom', y='son', title='Regression line predictions, in original units', figsize=(10, 5), label='original data')
preds.plot(kind='line', x='mom', y='predicted_height', ax=ax, color='orange', label='regression line', lw=4);
plt.legend();

Concept Check ✅ – Answer at cc.dsc10.com¶

A course has a midterm (mean 80, standard deviation 15) and a really hard final (mean 50, standard deviation 12).

If the scatter plot comparing midterm & final scores for students looks linearly associated with correlation 0.75, then what is the predicted final exam score for a student who received a 90 on the midterm?

  • A. 54
  • B. 56
  • C. 58
  • D. 60
  • E. 62

Summary, next time¶

Summary¶

  • The correlation coefficient, $r$, measures the linear association between two variables $x$ and $y$.
    • It ranges between -1 and 1.
  • When both variables are measured in standard units, the regression line is the straight line passing through $(0, 0)$ with slope $r$. We can use it to make predictions for a $y$ value (e.g. son's height) given an $x$ value (e.g. mother's height).

Next time¶

More on regression, including:

  • What is the equation of the regression line in original units (e.g. inches)?
  • In what sense is the regression line the "best" line for making predictions?