# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Render plots as crisp SVG instead of rasterized PNG.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Keep printed arrays/DataFrames compact: abbreviate long output and show
# 2 decimal places.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import time
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets
import warnings
# Suppress library warnings so they don't clutter the lecture output.
warnings.filterwarnings('ignore')
def normal_curve(x, mu=0, sigma=1):
    """Return the height of the normal density with mean ``mu`` and SD ``sigma`` at ``x``.

    Works elementwise on array inputs. The normalizing constant is
    1 / (sigma * sqrt(2 * pi)); the original version omitted ``sigma`` from
    the constant, so for ``sigma != 1`` (as used later with
    ``sigma=samp_mean_sd``) the curve did not integrate to 1 and was drawn
    too tall by a factor of ``sigma``.
    """
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
def normal_area(a, b, bars=False):
    """Plot the standard normal curve and shade (in gold) the area between a and b.

    If ``bars`` is True, also draw vertical red lines at the two endpoints.
    """
    grid = np.linspace(-4, 4, 1000)
    density = normal_curve(grid)
    in_range = (grid >= a) & (grid <= b)
    plt.figure(figsize=(10, 5))
    plt.plot(grid, density, color='black')
    plt.fill_between(grid[in_range], density[in_range], color='gold')
    if bars:
        for endpoint in (a, b):
            plt.axvline(endpoint, color='red')
    plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
def show_clt_slides():
    """Embed the CLT slide deck (hosted on Google Slides) in the notebook."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000"
    display(IFrame(slides_url, width=960, height=509))
The Final project is due on Tuesday 3/14 at 11:59PM and has 8 sections. How much progress have you made?
A. Not started or barely started ⏳
B. Finished 1 or 2 sections
C. Finished 3 or 4 sections ❤️
D. Finished 5 or 6 sections
E. Finished 7 or 8 sections 🤯
SAT scores range from 0 to 1600. The distribution of SAT scores has a mean of 950 and a standard deviation of 300. Your friend tells you that their SAT score, in standard units, is 2.5. What do you conclude?
scipy.stats.norm.cdf
Last time, we looked at a data set of heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
Height | Weight | |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
Both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of heights, just the mean and SD.
# Extract the heights column and compute its (population) mean.
heights = height_and_weight.get('Height')
height_mean = heights.mean()
height_mean
69.02634590621737
height_std = np.std(heights)
height_std
2.863075878119538
Using just this information, we can estimate the proportion of heights between 65 and 70 inches:
stats.norm.cdf
to find the area between (1) and (2).
left = (65 - height_mean) / height_std
left
-1.4063008029189459
right = (70 - height_mean) / height_std
right
0.3400727522534686
normal_area(left, right)
from scipy import stats
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.5532817187111865
Since we have access to the entire set of heights, we can compute the true proportion of heights between 65 and 70 inches.
# True proportion of values between 65 and 70.
height_and_weight[
(height_and_weight.get('Height') >= 65) &
(height_and_weight.get('Height') <= 70)
].shape[0] / height_and_weight.shape[0]
0.554
# Approximation using the standard normal curve.
approximation
0.5532817187111865
Pretty good for an approximation! 🤩
Consider the distribution of delays from earlier in the lecture.
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is move it horizontally and stretch it vertically – the shape itself doesn't change.
HTML('data/delay_anim.html')
Percent in Range | Normal Distribution |
---|---|
$\text{mean} \pm 1 \: \text{SD}$ | $\approx 68\%$ |
$\text{mean} \pm 2 \: \text{SDs}$ | $\approx 95\%$ |
$\text{mean} \pm 3 \: \text{SDs}$ | $\approx 99.73\%$ |
normal_area(-1, 1, bars=True)
stats.norm.cdf(1) - stats.norm.cdf(-1)
0.6826894921370859
This means that if a variable follows a normal distribution, approximately 68% of values will be within 1 SD of the mean.
normal_area(-2, 2, bars=True)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
---|---|---|
mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
Remember: The distribution of heights is roughly normal, but it is not a standard normal distribution.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=40, alpha=0.8, figsize=(10, 5));
plt.xticks(np.arange(60, 78, 2));
np.std(height_and_weight.get('Height'))
2.863075878119538
The distribution of flight delays that we've been looking at is not roughly normal.
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Population Distribution of Flight Delays')
plt.xlabel('Delay (minutes)');
delays.get('Delay').describe()
count 13825.00 mean 16.66 std 39.48 ... 50% 2.00 75% 18.00 max 580.00 Name: Delay, Length: 8, dtype: float64
Since we have access to the population of flight delays, let's remind ourselves what the distribution of the sample mean looks like by drawing samples repeatedly from the population.
# Simulate the distribution of the sample mean: repeatedly draw samples of
# size 500 from the full population of delays and record each sample's mean.
sample_means = np.array([])
repetitions = 2000
for i in np.arange(repetitions):
    sample = delays.sample(500)
    sample_mean = sample.get('Delay').mean()
    sample_means = np.append(sample_means, sample_mean)
sample_means
array([15.65, 17.02, 16.58, ..., 18.76, 16.87, 13.23])
# Density histogram of the 2000 simulated sample means, with the mean of the
# simulated means marked by a green triangle and vertical line.
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([sample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(sample_means.mean(), color='green', label=f'mean={np.round(sample_means.mean(), 2)}', linewidth=4)
plt.xlim(5, 30)
plt.ylim(-0.013, 0.26)
plt.legend();
Notice that this distribution is roughly normal, even though the population distribution was not! This distribution is centered at the population mean.
The Central Limit Theorem (CLT) says that the probability distribution of the sum or mean of a large random sample drawn with replacement will be roughly normal, regardless of the distribution of the population from which the sample is drawn.
While the formulas we're about to introduce only work for sample means, it's important to remember that the statement above also holds true for sample sums.
The function `sample_mean_delays` takes in an integer `sample_size`, and repeatedly draws samples of size `sample_size` directly from the population, returning an array containing the mean of each sample.
def sample_mean_delays(sample_size, repetitions=2000):
    """Simulate the distribution of the sample mean for a given sample size.

    Draws ``repetitions`` samples of size ``sample_size`` directly from the
    population of flight delays (the module-level ``delays`` DataFrame),
    computes each sample's mean delay, and returns the means as an array.

    ``repetitions`` is now a parameter (default 2000, matching the original
    behavior). Means are accumulated in a list and converted to an array
    once at the end, instead of calling np.append inside the loop, which
    copies the entire array on every iteration (quadratic time).
    """
    means = []
    for _ in range(repetitions):
        sample = delays.sample(sample_size)
        means.append(sample.get('Delay').mean())
    return np.array(means)
Let's call sample_mean_delays
on several values of sample_size
.
# Map each sample size to its 2000 simulated sample means.
sample_sizes = [5, 10, 50, 100, 200, 400, 800, 1600]
sample_means = {size: sample_mean_delays(size) for size in sample_sizes}
Let's look at the resulting distributions.
# Plot the resulting distributions.
# Uses a fixed bin grid so the histograms are comparable across sample sizes,
# and clears the previous plot after a short pause to create an animation
# effect (the last plot is left on screen).
bins = np.arange(5, 30, 0.5)
for size in sample_sizes:
    bpd.DataFrame().assign(data=sample_means[size]).plot(kind='hist', bins=bins, density=True, ec='w', title=f'Distribution of the Sample Mean for Samples of Size {size}', figsize=(8, 4))
    plt.legend('');
    plt.show()
    time.sleep(1.5)
    if size != sample_sizes[-1]:
        clear_output()
What do you notice? 🤔
# Compute the standard deviation of each distribution of sample means,
# one entry per sample size, in the same order as sample_sizes.
sds = np.array([np.std(sample_means[size]) for size in sample_sizes])
sds
array([17.97, 12.21, 5.44, 3.93, 2.72, 1.97, 1.32, 0.93])
# Tabulate sample size vs. SD of the simulated sample means, then scatter plot
# to show how the SD shrinks as the sample size grows.
observed = bpd.DataFrame().assign(
    SampleSize=sample_sizes,
    StandardDeviation=sds
)
observed.plot(kind='scatter', x='SampleSize', y='StandardDeviation', s=70, title="Standard Deviation of the Distribution of the Sample Mean vs. Sample Size", figsize=(10, 5));
It appears that as the sample size increases, the standard deviation of the distribution of the sample mean decreases quickly.
If we were to take many, many samples of the same size from a population, and take the mean of each sample, the distribution of the sample mean will have the following characteristics:
🚨 Practical Issue: The mean and standard deviation of the distribution of the sample mean both depend on the original population, but we typically don't have access to the population!
Let's take a single sample of size 500 from delays
.
# Fix the random seed for reproducibility, then draw a single sample of
# 500 delays to play the role of "the one sample we have in practice".
np.random.seed(42)
my_sample = delays.sample(500)
my_sample.get('Delay').describe()
count 500.00 mean 13.01 std 28.00 ... 50% 3.00 75% 16.00 max 209.00 Name: Delay, Length: 8, dtype: float64
Before today, to estimate the distribution of the sample mean using just this sample, we'd bootstrap:
# Bootstrap: resample 500 rows with replacement from the original sample,
# 2000 times, recording each resample's mean delay.
repetitions = 2000
boot_means = []
for _ in np.arange(repetitions):
    resample = my_sample.sample(500, replace=True)
    boot_means.append(resample.get('Delay').mean())
resample_means = np.array(boot_means)
resample_means
array([12.65, 11.5 , 11.34, ..., 12.59, 11.89, 12.58])
# Histogram of the 2000 bootstrapped resample means, with their mean marked
# by a green triangle and vertical line.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
plt.ylim(-0.015, 0.35)
plt.legend();
The CLT tells us what this distribution will look like, without having to bootstrap!
Suppose all we have access to in practice is a single "original sample." If we were to take many, many samples of the same size from this original sample, and take the mean of each resample, the distribution of the (re)sample mean will have the following characteristics:
Let's test this out!
Using just the original sample, my_sample
, we estimate that the distribution of the sample mean has the following mean:
samp_mean_mean = my_sample.get('Delay').mean()
samp_mean_mean
13.008
and the following standard deviation:
samp_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
samp_mean_sd
1.2511114546674091
Let's draw a normal distribution with the above mean and standard deviation, and overlay the bootstrapped distribution from earlier.
# Overlay the CLT-predicted normal curve (dashed black) on the bootstrapped
# distribution of the sample mean — the two should nearly coincide.
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
bpd.DataFrame().assign(Bootstrapping=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='CLT')
plt.title('Distribution of the Sample Mean, Using Two Methods')
plt.xlim(7, 20)
plt.legend();
Key takeaway: Given just a single sample, we can use the CLT to estimate the distribution of the sample mean, without bootstrapping.
show_clt_slides()
Earlier, we bootstrapped my_sample
to generate 2000 resample means. One approach to computing a confidence interval for the population mean involves taking the middle 95% of this distribution.
# Bootstrap-based 95% CI: take the middle 95% of the resample means.
left_boot = np.percentile(resample_means, 2.5)
right_boot = np.percentile(resample_means, 97.5)
[left_boot, right_boot]
[10.7159, 15.43405]
# Histogram of bootstrapped means with the 95% bootstrap CI drawn in gold
# along the x-axis.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');
plt.xlim(7, 20);
plt.legend();
Using the CLT and my_sample
only, we estimate that the sample mean's distribution is the following normal distribution:
# Plot the CLT-predicted distribution of the sample mean on its own.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.legend();
Question: What interval on the $x$-axis captures the middle 95% of the above distribution?
As we saw earlier, if a variable is roughly normal, then approximately 95% of its values are within 2 standard deviations of its mean.
normal_area(-2, 2)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Let's use this fact here!
# CLT-based 95% CI: sample mean ± 2 * (sample SD / sqrt(sample size)).
my_delays = my_sample.get('Delay')
center = my_delays.mean()
margin = 2 * np.std(my_delays) / np.sqrt(500)
left_normal = center - margin
right_normal = center + margin
[left_normal, right_normal]
[10.50577709066518, 15.510222909334818]
# Redraw the CLT-predicted curve with the CLT-based 95% CI drawn along the
# x-axis in dark gold.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.ylim(0, 0.41)
plt.plot([left_normal, right_normal], [0, 0], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')
plt.legend();
We've constructed two confidence intervals for the population mean:
One using bootstrapping,
[left_boot, right_boot]
[10.7159, 15.43405]
and one using the CLT.
[left_normal, right_normal]
[10.50577709066518, 15.510222909334818]
In both cases, we only used information in my_sample
, not the population.
An approximate 95% confidence interval for the population mean is given by
$$ \left[\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}}, \text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}} \right] $$This CI doesn't require bootstrapping, and it only requires three numbers – the sample mean, the sample SD, and the sample size!
The bootstrap still has its uses!
Bootstrap | CLT | |
---|---|---|
Pro | Works for many sample statistics (mean, median, standard deviation). |
Only requires 3 numbers – the sample mean, sample SD, and sample size. |
Con | Very computationally expensive (requires drawing many, many samples from the original sample). |
Only works for the sample mean (and sum). |