In [1]:
# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
import ipywidgets as widgets
from IPython.display import display, HTML

Lecture 20 – Spread, The Normal Distribution¶

DSC 10, Winter 2023¶

Announcements¶

  • Discussion section is this afternoon.
  • Homework 5 is due tomorrow at 11:59PM.
  • Lab 6 is due Tuesday 3/7 at 11:59PM.
  • The Final Project is due on Tuesday 3/14 at 11:59PM.

Agenda¶

  • Recap: Mean and median.
  • Standard deviation.
  • Standardization.
  • The normal distribution.

Recap: Mean and median¶

Example: Flight delays ✈️¶

In [2]:
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');

Question: Which is larger – the mean or the median?

In [3]:
delays.get('Delay').mean()
Out[3]:
16.658155515370705
In [4]:
delays.get('Delay').median()
Out[4]:
2.0
In [5]:
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', alpha=0.65, figsize=(10, 5), title='Flight Delays')
plt.plot([delays.get('Delay').mean(), delays.get('Delay').mean()], [0, 1], color='green', label='Mean', linewidth=2)
plt.scatter([delays.get('Delay').mean()], [-0.0017], color='green', marker='^', s=250)
plt.plot([delays.get('Delay').median(), delays.get('Delay').median()], [0, 1], color='purple', label='Median', linewidth=2)
plt.xlabel('Delay (minutes)')
plt.ylim(-0.005, 0.065)
plt.legend();

Comparing the mean and median¶

  • Mean: Balance point of the histogram.
    • Numerically: the sum of the differences between all data points and the mean is 0.
    • Physically: Think of a see-saw.
  • Median: Half-way point of the data.
    • Half of the area of a histogram is to the left of the median, and half is to the right.
  • If the distribution is symmetric about a value, then that value is both the mean and the median.
  • If the distribution is skewed, then the mean is pulled away from the median in the direction of the tail.
  • Key property: The median is more robust (less sensitive) to outliers.
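To see the robustness claim concretely, here is a minimal sketch using a made-up array of five values (not the flight delays data): replacing one value with an extreme outlier drags the mean far away, while the median barely moves.

# Hypothetical data, purely for illustration.
values = np.array([2, 3, 3, 4, 5])
with_outlier = np.array([2, 3, 3, 4, 500])

np.mean(values), np.median(values)              # (3.4, 3.0)
np.mean(with_outlier), np.median(with_outlier)  # (102.4, 3.0)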

Standard deviation¶

Question: How "wide" is a distribution?¶

  • One idea:
    • The range quantifies how far the extreme values are from one another (max - min).
    • Issue: This doesn’t tell us much about the shape of the distribution.
  • Another idea:
    • The mean is at the center.
    • The standard deviation quantifies how far the data points typically are from the center.

Deviations from the mean¶

In [6]:
data = np.array([2, 3, 3, 9])
np.mean(data)
Out[6]:
4.25
In [7]:
deviations = data - np.mean(data)
deviations
Out[7]:
array([-2.25, -1.25, -1.25,  4.75])

Each entry in deviations describes how far the corresponding element in data is from 4.25.

What is the average deviation?

In [8]:
np.mean(deviations)
Out[8]:
0.0
  • This is true of any dataset – the average deviation from the mean is always 0.
  • This implies that the average deviation itself is not useful in measuring the spread of data.

Average squared deviation¶

In [9]:
# Square all the deviations:
deviations ** 2
Out[9]:
array([ 5.06,  1.56,  1.56, 22.56])
In [10]:
variance = np.mean(deviations ** 2)
variance
Out[10]:
7.6875

This quantity, the average squared deviation from the mean, is called the variance.

Standard deviation¶

  • Our data usually has units, e.g. dollars.
  • The variance is in "squared" units, e.g. $\text{dollars}^2$.
  • To account for this, we can take the square root of the variance, and the result is called the standard deviation.
In [11]:
# Standard deviation (SD) is the square root of the variance.
sd = variance ** 0.5
sd
Out[11]:
2.7726341266023544

Standard deviation¶

  • The standard deviation (SD) measures, roughly, how far the data values typically are from their average.
    • It is not directly interpretable because of the squaring and square rooting.
    • But generally, larger SD = more spread out.
  • The standard deviation has the same units as the original data.
  • numpy has a function, np.std, that calculates the standard deviation for us.
In [12]:
# Note that this evaluates to the same number we found on the previous slide.
np.std(data)
Out[12]:
2.7726341266023544

Variance and standard deviation¶

To summarize:

$$\begin{align*}\text{variance} &= \text{average squared deviation from the mean}\\ &= \frac{(\text{value}_1 - \text{mean})^2 + ... + (\text{value}_n - \text{mean})^2}{n}\\ \text{standard deviation} &= \sqrt{\text{variance}} \end{align*}$$

where $n$ is the number of observations.
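One way to internalize these formulas is to translate them directly into code. Below is a minimal sketch; the helper names variance and standard_deviation are ours for illustration, not functions from numpy or babypandas.

def variance(values):
    # Average squared deviation from the mean.
    return np.mean((values - np.mean(values)) ** 2)

def standard_deviation(values):
    # Square root of the variance.
    return variance(values) ** 0.5

standard_deviation(np.array([2, 3, 3, 9]))   # 2.7726..., same as np.std(data)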

What can we do with the standard deviation?¶

It turns out, in any numerical distribution, the bulk of the data are in the range “mean ± a few SDs”.

Let's make this more precise.

Chebyshev’s inequality¶

Fact: In any numerical distribution, the proportion of values in the range “mean ± $z$ SDs” is at least

$$1 - \frac{1}{z^2}$$
Range        | Proportion
mean ± 2 SDs | at least $1 - \frac{1}{4}$ (75%)
mean ± 3 SDs | at least $1 - \frac{1}{9}$ (88.88...%)
mean ± 4 SDs | at least $1 - \frac{1}{16}$ (93.75%)
mean ± 5 SDs | at least $1 - \frac{1}{25}$ (96%)
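The bounds in the table come straight from the formula; a quick sketch that recomputes them:

# Chebyshev's lower bound on the proportion of values within z SDs of the mean.
for z in [2, 3, 4, 5]:
    print(f'mean ± {z} SDs: at least {1 - 1 / z**2}')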

Flight delays, revisited¶

In [13]:
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
In [14]:
delay_mean = delays.get('Delay').mean()
delay_mean
Out[14]:
16.658155515370705
In [15]:
delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std
Out[15]:
39.480199851609314

Mean and standard deviation¶

Chebyshev's inequality tells us that

  • At least 75% of delays are in the following interval:
In [16]:
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
Out[16]:
(-62.30224418784792, 95.61855521858934)
  • At least 88.88% of delays are in the following interval:
In [17]:
delay_mean - 3 * delay_std, delay_mean + 3 * delay_std
Out[17]:
(-101.78244403945723, 135.09875507019865)

Let's visualize these intervals!

In [18]:
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')
plt.axvline(delay_mean - 2 * delay_std, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + 2 * delay_std, color='maroon')

plt.axvline(delay_mean + 3 * delay_std, color='blue',  label='± 3 SD')
plt.axvline(delay_mean - 3 * delay_std, color='blue')

plt.axvline(delay_mean, color='green', label='Mean')
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();

Chebyshev's inequality provides lower bounds!¶

Remember, Chebyshev's inequality states that at least $1 - \frac{1}{z^2}$ of values are within $z$ SDs from the mean, for any numerical distribution.

For instance, it tells us that at least 75% of delays are in the following interval:

In [19]:
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
Out[19]:
(-62.30224418784792, 95.61855521858934)

However, in this case, a much larger fraction of delays are in that interval.

In [20]:
within_2_sds = delays[(delays.get('Delay') >= delay_mean - 2 * delay_std) & 
                      (delays.get('Delay') <= delay_mean + 2 * delay_std)]

within_2_sds.shape[0] / delays.shape[0]
Out[20]:
0.9560940325497288

If we know more about the shape of the distribution, we can provide better guarantees for the proportion of values within $z$ SDs of the mean.
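For instance, here is a rough sketch (reusing delay_mean and delay_std from above) that compares Chebyshev's lower bound to the actual proportion of delays within $z$ SDs of the mean, for a few values of $z$:

# Chebyshev's bound vs. the actual proportion, for the flight delays data.
for z in [2, 3, 4]:
    lower, upper = delay_mean - z * delay_std, delay_mean + z * delay_std
    within = delays[(delays.get('Delay') >= lower) & (delays.get('Delay') <= upper)]
    actual = within.shape[0] / delays.shape[0]
    print(f'{z} SDs: at least {1 - 1 / z**2:.4f}, actually {actual:.4f}')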

Activity¶

For a particular set of data points, Chebyshev's inequality states that at least $\frac{8}{9}$ of the data points are between $-20$ and $40$. What is the standard deviation of the data?

Answer (try it yourself before reading):

  • Chebyshev's inequality states that at least $1 - \frac{1}{z^2}$ of values are within $z$ standard deviations of the mean.
  • When $z = 3$, $1 - \frac{1}{z^2} = \frac{8}{9}$.
  • So, $-20$ is $3$ standard deviations below the mean, and $40$ is $3$ standard deviations above the mean.
  • $10$ is halfway between $-20$ and $40$, so the mean is $10$.
  • $3$ standard deviations cover the distance from $10$ to $40$, so $1$ standard deviation is $\frac{30}{3} = 10$.
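A quick numeric check of that answer (10 and 10 are the mean and SD we just solved for):

# With mean 10 and SD 10, mean ± 3 SDs is exactly (-20, 40),
# and Chebyshev guarantees at least 1 - 1/9 = 8/9 of the data in that range.
mean, sd = 10, 10
(mean - 3 * sd, mean + 3 * sd), 1 - 1 / 3**2   # ((-20, 40), 0.888...)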

Standardization¶

Heights and weights 📏¶

We'll work with a data set containing the heights and weights of 5000 adult males.

In [21]:
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
Out[21]:
Height Weight
0 73.85 241.89
1 68.78 162.31
2 74.11 212.74
... ... ...
4997 67.01 199.20
4998 71.56 185.91
4999 70.35 198.90

5000 rows × 2 columns

Distributions of height and weight¶

Let's look at the distributions of both numerical variables.

In [22]:
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=30, alpha=0.8, figsize=(10, 5));
In [23]:
height_and_weight.plot(kind='hist', y='Weight', density=True, ec='w', bins=30, alpha=0.8, color='C1', figsize=(10, 5));
In [24]:
height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));

Observation: The two distributions look like shifted and stretched versions of the same basic shape, called a bell curve 🔔.

Standard units¶

Suppose $x$ is a numerical variable, and $x_i$ is one value of that variable. The function $$z(x_i) = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$

converts $x_i$ to standard units, which represents the number of standard deviations $x_i$ is above the mean.

Example: Suppose someone weighs 225 pounds. What is their weight in standard units?

In [25]:
weights = height_and_weight.get('Weight')
(225 - weights.mean()) / np.std(weights)
Out[25]:
1.920169918158094
  • Interpretation: 225 is 1.92 standard deviations above the mean weight.
  • 225 becomes 1.92 in standard units.

Standardization¶

The process of converting all values of a variable (i.e. a column) to standard units is known as standardization, and the resulting values are considered to be standardized.

In [26]:
def standard_units(col):
    return (col - col.mean()) / np.std(col)
In [27]:
standardized_height = standard_units(height_and_weight.get('Height'))
standardized_height
Out[27]:
0       1.68
1      -0.09
2       1.78
        ... 
4997   -0.70
4998    0.88
4999    0.46
Name: Height, Length: 5000, dtype: float64
In [28]:
standardized_weight = standard_units(height_and_weight.get('Weight'))
standardized_weight
Out[28]:
0       2.77
1      -1.25
2       1.30
        ... 
4997    0.62
4998   -0.06
4999    0.60
Name: Weight, Length: 5000, dtype: float64

The effect of standardization¶

Standardized variables have:

  • A mean of 0.
  • An SD of 1.

We often standardize variables to bring them to the same scale.

Aside: To quickly see summary statistics for a numerical Series, use the .describe() Series method.

In [29]:
# e-14 means 10^(-14), which is a very small number, effectively zero.
standardized_height.describe()
Out[29]:
count    5.00e+03
mean     1.64e-14
std      1.00e+00
           ...   
50%      4.76e-04
75%      6.85e-01
max      3.48e+00
Name: Height, Length: 8, dtype: float64
In [30]:
standardized_weight.describe()
Out[30]:
count    5.00e+03
mean     1.64e-14
std      1.00e+00
           ...   
50%      6.53e-04
75%      6.74e-01
max      4.19e+00
Name: Weight, Length: 8, dtype: float64

Let's look at how the process of standardization works visually.

In [31]:
HTML('data/height_anim.html')
In [32]:
HTML('data/weight_anim.html')

Standardized histograms¶

Now that we've standardized the distributions of height and weight, let's see how they look on the same set of axes.

In [33]:
standardized_height_and_weight = bpd.DataFrame().assign(
    Height=standardized_height,
    Weight=standardized_weight
)
In [34]:
standardized_height_and_weight.plot(kind='hist', density=True, ec='w',bins=30, alpha=0.8, figsize=(10, 5));

These both look pretty similar!

The standard normal distribution¶

The standard normal distribution¶

  • The distributions we've seen look essentially the same once standardized.
  • This distribution is called the standard normal distribution. The shape is called the standard normal curve.
$$ \phi(z) = \frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2}z^2} $$
  • You don't need to know the formula – just the shape!
    • We'll just use the formula today to make plots.

The standard normal curve¶

In [35]:
def normal_curve(z):
    return 1 / np.sqrt(2*np.pi) * np.exp((-z**2)/2)
In [36]:
x = np.linspace(-4, 4, 1000)
y = normal_curve(x)

plt.figure(figsize=(10, 5))
plt.plot(x, y, color='black');
plt.xlabel('$z$');
plt.title(r'$\phi(z) = \frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2}z^2}$');

Heights/weights are roughly normal¶

If a distribution follows this shape, we say it is roughly normal.

In [37]:
standardized_height_and_weight.plot(kind='hist', density=True, ec='w', bins=120, alpha=0.8, figsize=(10, 5));
plt.plot(x, y, color='black', linestyle='--', label='Normal', linewidth=5)
plt.legend(loc='upper right');

The standard normal distribution¶

  • Think of the normal distribution as a "continuous histogram".
  • Its mean and median are both 0 – it is symmetric.
  • It has inflection points at $\pm 1$.
    • More on this later.
  • Like a histogram:
    • The area between $a$ and $b$ is the proportion of values between $a$ and $b$.
    • The total area underneath the normal curve is 1.
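We can sanity-check the "total area is 1" claim numerically, for example by integrating normal_curve (defined earlier) on a fine grid with np.trapz. This is an approximation of the integral, not an exact computation.

# Numerically approximate the total area under the standard normal curve.
grid = np.linspace(-10, 10, 10001)
np.trapz(normal_curve(grid), grid)   # very close to 1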
In [38]:
def normal_area(a, b, bars=False):
    x = np.linspace(-4, 4, 1000)
    y = normal_curve(x)
    ix = (x >= a) & (x <= b)
    plt.figure(figsize=(10, 5))
    plt.plot(x, y, color='black')
    plt.fill_between(x[ix], y[ix], color='gold')
    if bars:
        plt.axvline(a, color='red')
        plt.axvline(b, color='red')
    plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
In [39]:
a = widgets.FloatSlider(value=0, min=-4,max=3,step=0.25, description='a')
b = widgets.FloatSlider(value=1, min=-4,max=4,step=0.25, description='b')
bars = widgets.Checkbox(value=False, description='bars')
ui = widgets.HBox([a, b, bars])
out = widgets.interactive_output(normal_area, {'a': a, 'b': b, 'bars': bars})
display(ui, out)

Cumulative distribution functions¶

  • The cumulative distribution function (CDF) of a distribution is a function that takes in a value $z$ and returns the proportion of values in the distribution that are less than or equal to $z$, i.e. the area under the curve to the left of $z$.
  • To find areas under curves, we typically use integration (calculus). However, the standard normal curve has no closed-form integral.
  • Often, people refer to tables that contain approximations of the CDF of the standard normal distribution.
  • We'll use an approximation built into the scipy module in Python. The function scipy.stats.norm.cdf(z) computes the area under the standard normal curve to the left of z.

Areas under the standard normal curve¶

What does scipy.stats.norm.cdf(0) evaluate to? Why?

In [40]:
normal_area(-np.inf, 0)
In [41]:
from scipy import stats
stats.norm.cdf(0)
Out[41]:
0.5

Areas under the standard normal curve¶

Suppose we want to find the area to the right of 2 under the standard normal curve.

In [42]:
normal_area(2, np.inf)

The following expression gives us the area to the left of 2.

In [43]:
stats.norm.cdf(2)
Out[43]:
0.9772498680518208
In [44]:
normal_area(-np.inf, 2)

However, since the total area under the standard normal curve is 1:

$$\text{area right of $2$} = 1 - (\text{area left of $2$})$$
In [45]:
1 - stats.norm.cdf(2)
Out[45]:
0.02275013194817921

Areas under the standard normal curve¶

How might we use stats.norm.cdf to compute the area between -1 and 0?

In [46]:
normal_area(-1, 0)

Strategy:

$$\text{area from $-1$ to $0$} = (\text{area left of $0$}) - (\text{area left of $-1$})$$
In [47]:
stats.norm.cdf(0) - stats.norm.cdf(-1)
Out[47]:
0.3413447460685429

General strategy for finding area¶

The area under a standard normal curve in the interval $[a, b]$ is

stats.norm.cdf(b) - stats.norm.cdf(a)
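If you find yourself computing such areas repeatedly, you might wrap the strategy in a tiny helper. This is a sketch of our own; it is not built into scipy.

def area_between(a, b):
    # Area under the standard normal curve between a and b.
    return stats.norm.cdf(b) - stats.norm.cdf(a)

area_between(-1, 0)   # 0.3413..., matching the computation above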

What can we do with this? We're about to see!

Using the normal distribution¶

Let's return to our data set of heights and weights.

In [48]:
height_and_weight
Out[48]:
Height Weight
0 73.85 241.89
1 68.78 162.31
2 74.11 212.74
... ... ...
4997 67.01 199.20
4998 71.56 185.91
4999 70.35 198.90

5000 rows × 2 columns

As we saw before, both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?

Standard units and the normal distribution¶

  • Key idea: The $x$-axis in a plot of the standard normal distribution is in standard units.
    • For instance, the area between -1 and 1 is the proportion of values within 1 standard deviation of the mean.
  • Suppose a distribution is (roughly) normal. Then, these two are approximately equal:
    • The proportion of values in the distribution between $a$ and $b$.
    • The area between $z(a)$ and $z(b)$ under the standard normal curve. (Recall, $z(x_i) = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$.)

Example: Proportion of weights between 200 and 225 pounds¶

Let's suppose, as is often the case, that we don't have access to the entire distribution of weights, just the mean and SD.

In [49]:
weight_mean = weights.mean()
weight_mean
Out[49]:
187.02062065819288
In [50]:
weight_std = np.std(weights)
weight_std
Out[50]:
19.779176302396458

Using just this information, we can estimate the proportion of weights between 200 and 225 pounds:

  1. Convert 200 to standard units.
  2. Convert 225 to standard units.
  3. Use stats.norm.cdf to find the area between (1) and (2).
In [51]:
left = (200 - weight_mean) / weight_std
left
Out[51]:
0.6562143510614508
In [52]:
right = (225 - weight_mean) / weight_std
right
Out[52]:
1.920169918158094
In [53]:
normal_area(left, right)
In [54]:
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
Out[54]:
0.22842488819306006

Checking the approximation¶

Since we have access to the entire set of weights, we can compute the true proportion of weights between 200 and 225 pounds.

In [55]:
# True proportion of values between 200 and 225.
height_and_weight[
    (height_and_weight.get('Weight') >= 200) &
    (height_and_weight.get('Weight') <= 225)
].shape[0] / height_and_weight.shape[0]
Out[55]:
0.2294
In [56]:
# Approximation using the standard normal curve.
approximation
Out[56]:
0.22842488819306006

Pretty good for an approximation! 🤩

Warning: Standardization doesn't make a distribution normal!¶

Consider the distribution of delays from earlier in the lecture.

In [57]:
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');

The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is shift and rescale it – the shape itself doesn't change.

In [58]:
HTML('data/delay_anim.html')

Summary, next time¶

Summary: Spread and Chebyshev's inequality¶

  • Variance and standard deviation (SD) quantify how spread out data points are.
    • Standard deviation is the square root of variance.
    • Roughly speaking, the standard deviation describes how far values in a dataset typically are from the mean.
  • Chebyshev's inequality states that, in any numerical distribution, the proportion of values within $z$ SDs of the mean is at least $1 - \frac{1}{z^2}$.
    • The true proportion of values within $z$ SDs of the mean may be larger than $1 - \frac{1}{z^2}$, depending on the distribution, but it cannot be smaller.

Summary: Standard units and the normal distribution¶

  • To convert a value $x_i$ from a column $x$ to standard units, use the function $z(x_i) = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$.
    • A value in standard units measures the number of SDs the value is above the mean.
  • The normal distribution is bell-shaped, and arises often in nature.
  • The $x$-axis of the standard normal distribution is in standard units.
  • If we know a distribution is roughly normal, and we know its mean and SD, then we can use the standard normal distribution's curve to approximate the proportion of values within a given range without needing access to all of the data.
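To tie the last bullet to code: a minimal sketch of our own helper (not part of any library) that approximates the proportion of values in $[a, b]$ for a roughly normal variable, given only its mean and SD.

def estimate_proportion(a, b, mean, sd):
    # Convert the endpoints to standard units, then take the area
    # under the standard normal curve between them.
    left = (a - mean) / sd
    right = (b - mean) / sd
    return stats.norm.cdf(right) - stats.norm.cdf(left)

estimate_proportion(200, 225, weight_mean, weight_std)   # ≈ 0.2284, as computed earlier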

Next time¶

  • More properties of the normal distribution, and how they relate to Chebyshev's inequality.
  • The Central Limit Theorem.
  • Confidence intervals, revisited.