# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Render plots as crisp SVG instead of rasterized PNG.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Abbreviate long arrays and DataFrames when they are displayed.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import time
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets
# Silence warning output so it doesn't clutter the notebook.
import warnings
warnings.filterwarnings('ignore')
def normal_curve(x, mu=0, sigma=1):
    """Height of the normal density with mean `mu` and SD `sigma` at `x`.

    `x` may be a scalar or a numpy array (evaluated elementwise).

    Fix: the normalizing constant must be 1 / (sigma * sqrt(2*pi)).
    The original omitted the 1/sigma factor, so for sigma != 1 the
    curve did not integrate to 1 — e.g. the CLT overlay drawn later
    with sigma = samp_mean_sd was vertically mis-scaled.
    """
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
def normal_area(a, b, bars=False):
    """Plot the standard normal curve with the area between `a` and `b`
    (in standard units) shaded in gold.

    If `bars` is True, red vertical lines are also drawn at `a` and `b`.
    """
    grid = np.linspace(-4, 4, 1000)
    density = normal_curve(grid)
    in_region = (grid >= a) & (grid <= b)
    plt.figure(figsize=(10, 5))
    plt.plot(grid, density, color='black')
    plt.fill_between(grid[in_region], density[in_region], color='gold')
    if bars:
        plt.axvline(a, color='red')
        plt.axvline(b, color='red')
    plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
def show_clt_slides():
    """Embed the Central Limit Theorem slide deck in the notebook."""
    slides_src = "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000&rm=minimal"
    # 960 x 509 matches the deck's native 16:9-ish aspect ratio.
    display(IFrame(slides_src, 960, 509))
SAT scores range from 0 to 1600. The distribution of SAT scores has a mean of 950 and a standard deviation of 300. Your friend tells you that their SAT score, in standard units, is 2.5. What do you conclude?
normal_area(-1, 1)
We can compute areas under the standard normal curve exactly using `scipy.stats.norm.cdf`.
Last time, we looked at a data set of heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
Height | Weight | |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));
Both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of heights, just the mean and SD.
heights = height_and_weight.get('Height')
height_mean = heights.mean()
height_mean
69.02634590621741
height_std = np.std(heights)
height_std
2.863075878119538
Using just this information, we can estimate the proportion of heights between 65 and 70 inches:
We can use `stats.norm.cdf`
to find the area between (1) and (2).
left = (65 - height_mean) / height_std
left
-1.406300802918961
right = (70 - height_mean) / height_std
right
0.34007275225345374
normal_area(left, right)
from scipy import stats
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.5532817187111831
Since we have access to the entire set of heights, we can compute the true proportion of heights between 65 and 70 inches.
# True proportion of values between 65 and 70.
height_and_weight[
(height_and_weight.get('Height') >= 65) &
(height_and_weight.get('Height') <= 70)
].shape[0] / height_and_weight.shape[0]
0.554
# Approximation using the standard normal curve.
approximation
0.5532817187111831
Pretty good for an approximation! 🤩
Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
---|---|---|
mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
Remember, the values on the $x$-axis for the standard normal curve are in standard units. So, the proportion of values within 1 SD of the mean is the area under the standard normal curve between -1 and 1.
normal_area(-1, 1, bars=True)
stats.norm.cdf(1) - stats.norm.cdf(-1)
0.6826894921370859
This means that if a variable follows a normal distribution, approximately 68% of values will be within 1 SD of the mean.
normal_area(-2, 2, bars=True)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
---|---|---|
mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
The percentages you see for normal distributions above are approximate, but are not lower bounds.
Important: They apply to all normal distributions, standardized or not. This is because all normal distributions are just stretched and shifted versions of the standard normal distribution.
normal_area(-1, 1)
Remember: The distribution of heights is roughly normal, but it is not a standard normal distribution.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=40, alpha=0.8, figsize=(10, 5));
plt.xticks(np.arange(60, 78, 2));
np.std(height_and_weight.get('Height'))
2.863075878119538
The distribution of flight delays that we've been looking at is not roughly normal.
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Population Distribution of Flight Delays')
plt.xlabel('Delay (minutes)');
delays.get('Delay').describe()
count 13825.00 mean 16.66 std 39.48 ... 50% 2.00 75% 18.00 max 580.00 Name: Delay, Length: 8, dtype: float64
Since we have access to the population of flight delays, let's remind ourselves what the distribution of the sample mean looks like by drawing samples repeatedly from the population.
# Simulate the distribution of the sample mean: repeatedly draw samples
# of size 500 from the population of delays and record each sample's
# mean delay.
repetitions = 2000
sample_means = np.array([
    delays.sample(500).get('Delay').mean()
    for _ in np.arange(repetitions)
])
sample_means
array([17.06, 16.39, 16.58, ..., 12.78, 16.21, 15.36])
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([sample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(sample_means.mean(), color='green', label=f'mean={np.round(sample_means.mean(), 2)}', linewidth=4)
plt.xlim(5, 30)
plt.ylim(-0.013, 0.26)
plt.legend();
Notice that this distribution is roughly normal, even though the population distribution was not! This distribution is centered at the population mean.
The Central Limit Theorem (CLT) says that the probability distribution of the sum or mean of a large random sample drawn with replacement will be roughly normal, regardless of the distribution of the population from which the sample is drawn.
While the formulas we're about to introduce only work for sample means, it's important to remember that the statement above also holds true for sample sums.
The function `sample_mean_delays` takes in an integer `sample_size`, and repeatedly draws samples of that size
directly from the population.def sample_mean_delays(sample_size):
sample_means = np.array([])
for i in np.arange(2000):
sample = delays.sample(sample_size)
sample_mean = sample.get('Delay').mean()
sample_means = np.append(sample_means, sample_mean)
return sample_means
Let's call sample_mean_delays
on several values of sample_size
.
# Simulate the distribution of the sample mean once for each sample size,
# keyed by the sample size.
sample_sizes = [5, 10, 50, 100, 200, 400, 800, 1600]
sample_means = {n: sample_mean_delays(n) for n in sample_sizes}
Let's look at the resulting distributions.
# Animate the resulting distributions: show each histogram briefly, then
# clear the output before drawing the next one (the last plot stays).
bins = np.arange(5, 30, 0.5)
for n in sample_sizes:
    bpd.DataFrame().assign(data=sample_means[n]).plot(kind='hist', bins=bins, density=True, ec='w', title=f'Distribution of the Sample Mean for Samples of Size {n}', figsize=(8, 4))
    plt.legend('')
    plt.show()
    time.sleep(1.5)
    if n != sample_sizes[-1]:
        clear_output()
What do you notice? 🤔
# Compute the standard deviation of each simulated distribution of the
# sample mean, in the same order as sample_sizes.
sds = np.array([np.std(sample_means[n]) for n in sample_sizes])
sds
array([17.78, 12.25, 5.69, 3.93, 2.77, 1.99, 1.36, 0.91])
# Table pairing each sample size with the SD of the corresponding
# simulated distribution of the sample mean, then plotted as a scatter.
observed = bpd.DataFrame().assign(
SampleSize=sample_sizes,
StandardDeviation=sds
)
observed.plot(kind='scatter', x='SampleSize', y='StandardDeviation', s=70, title="Standard Deviation of the Distribution of the Sample Mean vs. Sample Size", figsize=(10, 5));
It appears that as the sample size increases, the standard deviation of the distribution of the sample mean decreases quickly.
If we were to take many, many samples of the same size from a population, and take the mean of each sample, the distribution of the sample mean will have the following characteristics:
🚨 Practical Issue: The mean and standard deviation of the distribution of the sample mean both depend on the original population, but we typically don't have access to the population!
Let's take a single sample of size 500 from delays
.
# Fix the random seed so the same "original sample" is drawn every run.
np.random.seed(42)
my_sample = delays.sample(500)
my_sample.get('Delay').describe()
count 500.00 mean 13.01 std 28.00 ... 50% 3.00 75% 16.00 max 209.00 Name: Delay, Length: 8, dtype: float64
Before today, to estimate the distribution of the sample mean using just this sample, we'd bootstrap:
# Bootstrap: resample 500 rows WITH replacement from the original sample
# 2000 times, recording the mean delay of each resample.
repetitions = 2000
resample_means = np.array([
    my_sample.sample(500, replace=True).get('Delay').mean()
    for _ in np.arange(repetitions)
])
resample_means
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
plt.ylim(-0.015, 0.35)
plt.legend();
The CLT tells us what this distribution will look like, without having to bootstrap!
Suppose all we have access to in practice is a single "original sample." If we were to take many, many samples of the same size from this original sample, and take the mean of each resample, the distribution of the (re)sample mean will have the following characteristics:
Let's test this out!
Using just the original sample, my_sample
, we estimate that the distribution of the sample mean has the following mean:
samp_mean_mean = my_sample.get('Delay').mean()
samp_mean_mean
13.008
and the following standard deviation:
samp_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
samp_mean_sd
1.2511114546674091
Let's draw a normal distribution with the above mean and standard deviation, and overlay the bootstrapped distribution from earlier.
# Normal curve predicted by the CLT, with mean = sample mean and
# SD = sample SD / sqrt(sample size), both estimated from `my_sample`.
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
# Overlay the CLT curve on the bootstrapped distribution for comparison.
bpd.DataFrame().assign(Bootstrapping=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='CLT')
plt.title('Distribution of the Sample Mean, Using Two Methods')
plt.xlim(7, 20)
plt.legend();
Key takeaway: Given just a single sample, we can use the CLT to estimate the distribution of the sample mean, without bootstrapping.
show_clt_slides()
Now, we can make confidence intervals for population means without needing to bootstrap!
Bootstrapping still has its uses!
Bootstrapping | CLT | |
---|---|---|
Pro | Works for many sample statistics (mean, median, standard deviation). |
Only requires 3 numbers – the sample mean, sample SD, and sample size. |
Con | Very computationally expensive (requires drawing many, many samples from the original sample). |
Only works for the sample mean (and sum). |