# Run this cell to set up packages for lecture.
from lec16_imports import *
The standard deviation is defined as $$\text{SD} = \sqrt{\frac{1}{n} \sum_{i = 1}^n (x_i - \text{mean of } x)^2}$$ where $n$ is the number of observations.
It turns out, in any numerical distribution, the bulk of the data are in the range “mean ± a few SDs”.
Let's make this more precise.
Fact: In any numerical distribution, the proportion of values in the range “mean ± $z$ SDs” is at least
$$1 - \frac{1}{z^2}$$

| Range | Proportion |
|---|---|
| mean ± 2 SDs | at least $1 - \frac{1}{4}$ (75%) |
| mean ± 3 SDs | at least $1 - \frac{1}{9}$ (≈88.89%) |
| mean ± 4 SDs | at least $1 - \frac{1}{16}$ (93.75%) |
| mean ± 5 SDs | at least $1 - \frac{1}{25}$ (96%) |
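The bounds in the table can be computed directly. A minimal sketch (the helper name `chebyshev_bound` is our own):

```python
# Chebyshev's bound: at least 1 - 1/z^2 of values lie within z SDs of the mean.
def chebyshev_bound(z):
    return 1 - 1 / z**2

# Matches the table above: 0.75, ~0.8889, 0.9375, 0.96.
[chebyshev_bound(z) for z in [2, 3, 4, 5]]
```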
delays = bpd.read_csv('data/united_summer2015.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
delay_mean = delays.get('Delay').mean()
delay_mean
16.658155515370705
delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std
39.480199851609314
Chebyshev's inequality tells us that at least 75% of delays are within 2 SDs of the mean, and at least $\frac{8}{9}$ (≈88.89%) are within 3 SDs of the mean. Let's compute these intervals.
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
delay_mean - 3 * delay_std, delay_mean + 3 * delay_std
(-101.78244403945723, 135.09875507019865)
Let's visualize these intervals!
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')
plt.axvline(delay_mean - 2 * delay_std, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + 2 * delay_std, color='maroon')
plt.axvline(delay_mean + 3 * delay_std, color='blue', label='± 3 SD')
plt.axvline(delay_mean - 3 * delay_std, color='blue')
plt.axvline(delay_mean, color='green', label='Mean')
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();
Remember, Chebyshev's inequality states that at least $1 - \frac{1}{z^2}$ of values are within $z$ SDs from the mean, for any numerical distribution.
For instance, it tells us that at least 75% of delays are in the following interval:
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
However, in this case, a much larger fraction of delays are in that interval.
within_2_sds = delays[(delays.get('Delay') >= delay_mean - 2 * delay_std) &
(delays.get('Delay') <= delay_mean + 2 * delay_std)]
within_2_sds.shape[0] / delays.shape[0]
0.9560940325497288
If we know more about the shape of the distribution, we can provide better guarantees for the proportion of values within $z$ SDs of the mean.
For a particular set of data points, Chebyshev's inequality states that at least $\frac{8}{9}$ of the data points are between $-20$ and $40$. What is the standard deviation of the data?
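One way to reason through this question, as a sketch (assuming the interval is symmetric about the mean):

```python
# Chebyshev guarantees at least 1 - 1/z^2 within z SDs of the mean.
# 1 - 1/z^2 = 8/9 means 1/z^2 = 1/9, so z = 3: the interval spans mean ± 3 SDs.
low, high = -20, 40
mean = (low + high) / 2   # by symmetry, the mean is the midpoint: 10
z = 3
sd = (high - mean) / z    # 3 SDs cover 30 units, so one SD is 10
sd
```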
We'll work with a data set containing the heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
|  | Height | Weight |
|---|---|---|
| 0 | 73.85 | 241.89 |
| 1 | 68.78 | 162.31 |
| 2 | 74.11 | 212.74 |
| ... | ... | ... |
| 4997 | 67.01 | 199.20 |
| 4998 | 71.56 | 185.91 |
| 4999 | 70.35 | 198.90 |

5000 rows × 2 columns
Let's look at the distributions of both numerical variables.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=30, alpha=0.8, figsize=(10, 5));
height_and_weight.plot(kind='hist', y='Weight', density=True, ec='w', bins=30, alpha=0.8, color='C1', figsize=(10, 5));
height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));
Observation: The two distributions look like shifted and stretched versions of the same basic shape, called a bell curve 🔔. Distributions shaped like this are called normal distributions.
show_many_normal_distributions()
Suppose $x$ is a numerical variable, and $x_i$ is one value of that variable. Then, $$x_{i \: \text{(su)}} = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
represents $x_i$ in standard units – the number of standard deviations $x_i$ is above the mean.
Example: Suppose someone weighs 225 pounds. What is their weight in standard units?
weights = height_and_weight.get('Weight')
(225 - weights.mean()) / np.std(weights)
1.9201699181580782
The process of converting all values of a variable (i.e. a column) to standard units is known as standardization, and the resulting values are considered to be standardized.
def standard_units(col):
    # Convert each value in the column to standard units.
    return (col - col.mean()) / np.std(col)
standardized_height = standard_units(height_and_weight.get('Height'))
standardized_height
0 1.68 1 -0.09 2 1.78 ... 4997 -0.70 4998 0.88 4999 0.46 Name: Height, Length: 5000, dtype: float64
standardized_weight = standard_units(height_and_weight.get('Weight'))
standardized_weight
0 2.77 1 -1.25 2 1.30 ... 4997 0.62 4998 -0.06 4999 0.60 Name: Weight, Length: 5000, dtype: float64
Standardized variables have a mean of 0 and an SD of 1.
We often standardize variables to bring them to the same scale.
# e-15 means 10^(-15), which is a very small number, effectively zero.
standardized_height.describe()
count 5.00e+03 mean 1.49e-15 std 1.00e+00 ... 50% 4.76e-04 75% 6.85e-01 max 3.48e+00 Name: Height, Length: 8, dtype: float64
standardized_weight.describe()
count 5.00e+03 mean 5.98e-16 std 1.00e+00 ... 50% 6.53e-04 75% 6.74e-01 max 4.19e+00 Name: Weight, Length: 8, dtype: float64
Let's look at how the process of standardization works visually.
HTML('data/height_anim.html')
HTML('data/weight_anim.html')
Now that we've standardized the distributions of height and weight, let's see how they look on the same set of axes.
standardized_height_and_weight = bpd.DataFrame().assign(
Height=standardized_height,
Weight=standardized_weight
)
standardized_height_and_weight.plot(kind='hist', density=True, ec='w',bins=30, alpha=0.8, figsize=(10, 5));
These both look pretty similar!
def normal_curve(z):
    return 1 / np.sqrt(2 * np.pi) * np.exp(-z**2 / 2)
x = np.linspace(-4, 4, 1000)
y = normal_curve(x)
plt.figure(figsize=(10, 5))
plt.plot(x, y, color='black');
plt.xlabel('$z$');
plt.title(r'$\phi(z) = \frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2}z^2}$');
If a distribution follows this shape, we say it is roughly normal.
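As a sanity check, the total area under $\phi$ is 1. A quick sketch using a numerical Riemann sum (the tails beyond $\pm 10$ are negligible):

```python
import numpy as np

def normal_curve(z):
    return 1 / np.sqrt(2 * np.pi) * np.exp(-z**2 / 2)

# Approximate the integral of the curve over [-10, 10] with a Riemann sum.
x = np.linspace(-10, 10, 100_001)
dx = x[1] - x[0]
area = np.sum(normal_curve(x)) * dx
area  # very close to 1
```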
standardized_height_and_weight.plot(kind='hist', density=True, ec='w', bins=120, alpha=0.8, figsize=(10, 5));
plt.plot(x, y, color='black', linestyle='--', label='Normal', linewidth=5)
plt.legend(loc='upper right');
sliders()
# cdf(0) should give us the gold area below.
normal_area(-np.inf, 0)
We can use the `scipy` module in Python. The function `scipy.stats.norm.cdf(z)` computes the area under the standard normal curve to the left of `z`. What does `scipy.stats.norm.cdf(0)` evaluate to? Why?
normal_area(-np.inf, 0)
from scipy import stats
stats.norm.cdf(0)
0.5
Suppose we want to find the area to the right of 2 under the standard normal curve.
normal_area(2, np.inf)
The following expression gives us the area to the left of 2.
stats.norm.cdf(2)
0.9772498680518208
normal_area(-np.inf, 2)
However, since the total area under the standard normal curve is 1:
$$\text{area right of $2$} = 1 - (\text{area left of $2$})$$

1 - stats.norm.cdf(2)
0.02275013194817921
How might we use stats.norm.cdf
to compute the area between -1 and 0?
normal_area(-1, 0)
Strategy:
$$\text{area from $-1$ to $0$} = (\text{area left of $0$}) - (\text{area left of $-1$})$$

stats.norm.cdf(0) - stats.norm.cdf(-1)
0.3413447460685429
The area under a standard normal curve in the interval $[a, b]$ is
stats.norm.cdf(b) - stats.norm.cdf(a)
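This identity can be wrapped in a small helper. A sketch (the function name `area_between` is our own, not part of `scipy`):

```python
from scipy import stats

def area_between(a, b):
    # Area under the standard normal curve between a and b.
    return stats.norm.cdf(b) - stats.norm.cdf(a)

area_between(-1, 0)
```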
What can we do with this? We're about to see!
Let's return to our data set of heights and weights.
height_and_weight
|  | Height | Weight |
|---|---|---|
| 0 | 73.85 | 241.89 |
| 1 | 68.78 | 162.31 |
| 2 | 74.11 | 212.74 |
| ... | ... | ... |
| 4997 | 67.01 | 199.20 |
| 4998 | 71.56 | 185.91 |
| 4999 | 70.35 | 198.90 |

5000 rows × 2 columns
As we saw before, both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of weights, but just the mean and SD.
weight_mean = weights.mean()
weight_mean
187.0206206581932
weight_std = np.std(weights)
weight_std
19.779176302396458
Using just this information, we can estimate the proportion of weights between 200 and 225 pounds:

1. Convert 200 and 225 to standard units.
2. Use `stats.norm.cdf` to find the area between (1) and (2).

left = (200 - weight_mean) / weight_std
left
0.656214351061435
right = (225 - weight_mean) / weight_std
right
1.9201699181580782
normal_area(left, right)
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.22842488819306406
Since we have access to the entire set of weights, we can compute the true proportion of weights between 200 and 225 pounds.
# True proportion of values between 200 and 225.
height_and_weight[
(height_and_weight.get('Weight') >= 200) &
(height_and_weight.get('Weight') <= 225)
].shape[0] / height_and_weight.shape[0]
0.2294
# Approximation using the standard normal curve.
approximation
0.22842488819306406
Pretty good for an approximation! 🤩
Consider the distribution of delays from earlier in the lecture.
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is move it horizontally and stretch it vertically – the shape itself doesn't change.
HTML('data/delay_anim.html')
| Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
|---|---|---|
| mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
| mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
| mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
Remember, the values on the $x$-axis for the standard normal curve are in standard units. So, the proportion of values within 1 SD of the mean is the area under the standard normal curve between -1 and 1.
normal_area(-1, 1, bars=True)
stats.norm.cdf(1) - stats.norm.cdf(-1)
0.6826894921370859
This means that if a variable follows a normal distribution, approximately 68% of values will be within 1 SD of the mean.
normal_area(-2, 2, bars=True)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
| Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
|---|---|---|
| mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
| mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
| mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
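The normal-distribution column of the table can be reproduced directly with `scipy` (a short sketch):

```python
from scipy import stats

# Proportion of a normal distribution within z SDs of the mean, for z = 1, 2, 3.
# These are approximately 68%, 95%, and 99.73%.
{z: stats.norm.cdf(z) - stats.norm.cdf(-z) for z in [1, 2, 3]}
```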
The percentages shown for normal distributions above are approximations, not lower bounds.
Important: They apply to all normal distributions, standardized or not. This is because all normal distributions are just stretched and shifted versions of the standard normal distribution.
normal_area(-1, 1)
Remember: The distribution of heights is roughly normal, but it is not a standard normal distribution.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=40, alpha=0.8, figsize=(10, 5));
plt.xticks(np.arange(60, 78, 2));
np.std(height_and_weight.get('Height'))
2.863075878119538
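We can check the 68% rule empirically. A sketch using simulated roughly-normal heights (the real CSV isn't assumed here; the mean 69 and SD 2.9 are made-up values in the ballpark of the data above):

```python
import numpy as np

# Simulate 5000 roughly-normal heights and compute the proportion
# within 1 SD of the mean; it should be close to 0.68.
rng = np.random.default_rng(23)
heights = rng.normal(69, 2.9, size=5000)
mean, sd = heights.mean(), np.std(heights)
prop = np.mean((heights >= mean - sd) & (heights <= mean + sd))
prop
```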