# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
# Render plots as crisp SVG instead of rasterized PNG.
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Keep printed arrays/DataFrames compact: abbreviate long output and show
# 2 decimal places.
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import time
from IPython.display import display, HTML, IFrame, clear_output
import ipywidgets as widgets
import warnings
# Suppress library warnings so they don't clutter the lecture output.
warnings.filterwarnings('ignore')
def normal_curve(x, mu=0, sigma=1):
    """Return the height of the normal density with mean ``mu`` and SD ``sigma`` at ``x``.

    Works elementwise on array inputs. The normalizing constant is
    1 / (sigma * sqrt(2 * pi)); the original version omitted ``sigma`` from
    the constant, so for ``sigma != 1`` (as used later with
    ``sigma=samp_mean_sd``) the curve did not integrate to 1 and was drawn
    too tall by a factor of ``sigma``.
    """
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
def normal_area(a, b, bars=False):
    """Plot the standard normal curve and shade (in gold) the area between a and b.

    If ``bars`` is True, also draw vertical red lines at the two endpoints.
    """
    grid = np.linspace(-4, 4, 1000)
    density = normal_curve(grid)
    in_range = (grid >= a) & (grid <= b)
    plt.figure(figsize=(10, 5))
    plt.plot(grid, density, color='black')
    plt.fill_between(grid[in_range], density[in_range], color='gold')
    if bars:
        for endpoint in (a, b):
            plt.axvline(endpoint, color='red')
    plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
def show_clt_slides():
    """Embed the CLT slide deck (hosted on Google Slides) in the notebook."""
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000"
    display(IFrame(slides_url, width=960, height=509))
The Final project is due on Tuesday 3/14 at 11:59PM and has 8 sections. How much progress have you made?
A. Not started or barely started ⏳
B. Finished 1 or 2 sections
C. Finished 3 or 4 sections ❤️
D. Finished 5 or 6 sections
E. Finished 7 or 8 sections 🤯
SAT scores range from 0 to 1600. The distribution of SAT scores has a mean of 950 and a standard deviation of 300. Your friend tells you that their SAT score, in standard units, is 2.5. What do you conclude?
scipy.stats.norm.cdf
Last time, we looked at a data set of heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
Height | Weight | |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
Both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of heights, just the mean and SD.
# Extract the heights column and compute its (population) mean.
heights = height_and_weight.get('Height')
height_mean = heights.mean()
height_mean
69.02634590621737
height_std = np.std(heights)
height_std
2.863075878119538
Using just this information, we can estimate the proportion of heights between 65 and 70 inches:
stats.norm.cdf
to find the area between (1) and (2).
left = (65 - height_mean) / height_std
left
-1.4063008029189459
right = (70 - height_mean) / height_std
right
0.3400727522534686
normal_area(left, right)
from scipy import stats
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.5532817187111865
Since we have access to the entire set of heights, we can compute the true proportion of heights between 65 and 70 inches.
# True proportion of values between 65 and 70.
height_and_weight[
(height_and_weight.get('Height') >= 65) &
(height_and_weight.get('Height') <= 70)
].shape[0] / height_and_weight.shape[0]
0.554
# Approximation using the standard normal curve.
approximation
0.5532817187111865
Pretty good for an approximation! 🤩
Consider the distribution of delays from earlier in the lecture.
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is move it horizontally and stretch it vertically – the shape itself doesn't change.
HTML('data/delay_anim.html')
Percent in Range | Normal Distribution |
---|---|
$\text{mean} \pm 1 \: \text{SD}$ | $\approx 68\%$ |
$\text{mean} \pm 2 \: \text{SDs}$ | $\approx 95\%$ |
$\text{mean} \pm 3 \: \text{SDs}$ | $\approx 99.73\%$ |
normal_area(-1, 1, bars=True)
stats.norm.cdf(1) - stats.norm.cdf(-1)
0.6826894921370859
This means that if a variable follows a normal distribution, approximately 68% of values will be within 1 SD of the mean.
normal_area(-2, 2, bars=True)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Range | All Distributions (via Chebyshev's inequality) | Normal Distribution |
---|---|---|
mean $\pm \ 1$ SD | $\geq 0\%$ | $\approx 68\%$ |
mean $\pm \ 2$ SDs | $\geq 75\%$ | $\approx 95\%$ |
mean $\pm \ 3$ SDs | $\geq 88.8\%$ | $\approx 99.73\%$ |
Remember: The distribution of heights is roughly normal, but it is not a standard normal distribution.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=40, alpha=0.8, figsize=(10, 5));
plt.xticks(np.arange(60, 78, 2));
np.std(height_and_weight.get('Height'))
2.863075878119538
The distribution of flight delays that we've been looking at is not roughly normal.
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Population Distribution of Flight Delays')
plt.xlabel('Delay (minutes)');
delays.get('Delay').describe()
count 13825.00 mean 16.66 std 39.48 ... 50% 2.00 75% 18.00 max 580.00 Name: Delay, Length: 8, dtype: float64
Since we have access to the population of flight delays, let's remind ourselves what the distribution of the sample mean looks like by drawing samples repeatedly from the population.
# Simulate the distribution of the sample mean: repeatedly draw samples of
# size 500 from the full population of delays and record each sample's mean.
sample_means = np.array([])
repetitions = 2000
for i in np.arange(repetitions):
    sample = delays.sample(500)
    sample_mean = sample.get('Delay').mean()
    sample_means = np.append(sample_means, sample_mean)
sample_means
array([15.65, 17.02, 16.58, ..., 18.76, 16.87, 13.23])
# Density histogram of the 2000 simulated sample means, with the mean of the
# simulated means marked by a green triangle and vertical line.
bpd.DataFrame().assign(sample_means=sample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([sample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(sample_means.mean(), color='green', label=f'mean={np.round(sample_means.mean(), 2)}', linewidth=4)
plt.xlim(5, 30)
plt.ylim(-0.013, 0.26)
plt.legend();
Notice that this distribution is roughly normal, even though the population distribution was not! This distribution is centered at the population mean.
The Central Limit Theorem (CLT) says that the probability distribution of the sum or mean of a large random sample drawn with replacement will be roughly normal, regardless of the distribution of the population from which the sample is drawn.
While the formulas we're about to introduce only work for sample means, it's important to remember that the statement above also holds true for sample sums.
The function `sample_mean_delays` takes in an integer `sample_size`, and repeatedly draws samples of size `sample_size` directly from the population, returning an array containing the mean of each sample.
def sample_mean_delays(sample_size, repetitions=2000):
    """Simulate the distribution of the sample mean for a given sample size.

    Draws ``repetitions`` samples of size ``sample_size`` directly from the
    population of flight delays (the module-level ``delays`` DataFrame),
    computes each sample's mean delay, and returns the means as an array.

    ``repetitions`` is now a parameter (default 2000, matching the original
    behavior). Means are accumulated in a list and converted to an array
    once at the end, instead of calling np.append inside the loop, which
    copies the entire array on every iteration (quadratic time).
    """
    means = []
    for _ in range(repetitions):
        sample = delays.sample(sample_size)
        means.append(sample.get('Delay').mean())
    return np.array(means)
Let's call sample_mean_delays
on several values of sample_size
.
# Map each sample size to its 2000 simulated sample means.
sample_sizes = [5, 10, 50, 100, 200, 400, 800, 1600]
sample_means = {size: sample_mean_delays(size) for size in sample_sizes}
Let's look at the resulting distributions.
# Plot the resulting distributions.
# Uses a fixed bin grid so the histograms are comparable across sample sizes,
# and clears the previous plot after a short pause to create an animation
# effect (the last plot is left on screen).
bins = np.arange(5, 30, 0.5)
for size in sample_sizes:
    bpd.DataFrame().assign(data=sample_means[size]).plot(kind='hist', bins=bins, density=True, ec='w', title=f'Distribution of the Sample Mean for Samples of Size {size}', figsize=(8, 4))
    plt.legend('');
    plt.show()
    time.sleep(1.5)
    if size != sample_sizes[-1]:
        clear_output()
What do you notice? 🤔
# Compute the standard deviation of each distribution of sample means,
# one entry per sample size, in the same order as sample_sizes.
sds = np.array([np.std(sample_means[size]) for size in sample_sizes])
sds
array([17.97, 12.21, 5.44, 3.93, 2.72, 1.97, 1.32, 0.93])
# Tabulate sample size vs. SD of the simulated sample means, then scatter plot
# to show how the SD shrinks as the sample size grows.
observed = bpd.DataFrame().assign(
    SampleSize=sample_sizes,
    StandardDeviation=sds
)
observed.plot(kind='scatter', x='SampleSize', y='StandardDeviation', s=70, title="Standard Deviation of the Distribution of the Sample Mean vs. Sample Size", figsize=(10, 5));
It appears that as the sample size increases, the standard deviation of the distribution of the sample mean decreases quickly.
If we were to take many, many samples of the same size from a population, and take the mean of each sample, the distribution of the sample mean will have the following characteristics:
🚨 Practical Issue: The mean and standard deviation of the distribution of the sample mean both depend on the original population, but we typically don't have access to the population!
Let's take a single sample of size 500 from delays
.
# Fix the random seed for reproducibility, then draw a single sample of
# 500 delays to play the role of "the one sample we have in practice".
np.random.seed(42)
my_sample = delays.sample(500)
my_sample.get('Delay').describe()
count 500.00 mean 13.01 std 28.00 ... 50% 3.00 75% 16.00 max 209.00 Name: Delay, Length: 8, dtype: float64
Before today, to estimate the distribution of the sample mean using just this sample, we'd bootstrap:
# Bootstrap: resample 500 rows with replacement from the original sample,
# 2000 times, recording each resample's mean delay.
repetitions = 2000
boot_means = []
for _ in np.arange(repetitions):
    resample = my_sample.sample(500, replace=True)
    boot_means.append(resample.get('Delay').mean())
resample_means = np.array(boot_means)
resample_means
array([12.65, 11.5 , 11.34, ..., 12.59, 11.89, 12.58])
# Histogram of the 2000 bootstrapped resample means, with their mean marked
# by a green triangle and vertical line.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
plt.ylim(-0.015, 0.35)
plt.legend();
The CLT tells us what this distribution will look like, without having to bootstrap!
Suppose all we have access to in practice is a single "original sample." If we were to take many, many samples of the same size from this original sample, and take the mean of each resample, the distribution of the (re)sample mean will have the following characteristics:
Let's test this out!
Using just the original sample, my_sample
, we estimate that the distribution of the sample mean has the following mean:
samp_mean_mean = my_sample.get('Delay').mean()
samp_mean_mean
13.008
and the following standard deviation:
samp_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
samp_mean_sd
1.2511114546674091
Let's draw a normal distribution with the above mean and standard deviation, and overlay the bootstrapped distribution from earlier.
# Overlay the CLT-predicted normal curve (dashed black) on the bootstrapped
# distribution of the sample mean — the two should nearly coincide.
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
bpd.DataFrame().assign(Bootstrapping=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='CLT')
plt.title('Distribution of the Sample Mean, Using Two Methods')
plt.xlim(7, 20)
plt.legend();
Key takeaway: Given just a single sample, we can use the CLT to estimate the distribution of the sample mean, without bootstrapping.
show_clt_slides()
Earlier, we bootstrapped my_sample
to generate 2000 resample means. One approach to computing a confidence interval for the population mean involves taking the middle 95% of this distribution.
# Bootstrap-based 95% CI: take the middle 95% of the resample means.
left_boot = np.percentile(resample_means, 2.5)
right_boot = np.percentile(resample_means, 97.5)
[left_boot, right_boot]
[10.7159, 15.43405]
# Histogram of bootstrapped means with the 95% bootstrap CI drawn in gold
# along the x-axis.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');
plt.xlim(7, 20);
plt.legend();
Using the CLT and my_sample
only, we estimate that the sample mean's distribution is the following normal distribution:
# Plot the CLT-predicted distribution of the sample mean on its own.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.legend();
Question: What interval on the $x$-axis captures the middle 95% of the above distribution?
As we saw earlier, if a variable is roughly normal, then approximately 95% of its values are within 2 standard deviations of its mean.
normal_area(-2, 2)
stats.norm.cdf(2) - stats.norm.cdf(-2)
0.9544997361036416
Let's use this fact here!
# CLT-based 95% CI: sample mean ± 2 * (sample SD / sqrt(sample size)).
my_delays = my_sample.get('Delay')
center = my_delays.mean()
margin = 2 * np.std(my_delays) / np.sqrt(500)
left_normal = center - margin
right_normal = center + margin
[left_normal, right_normal]
[10.50577709066518, 15.510222909334818]
# Redraw the CLT-predicted curve with the CLT-based 95% CI drawn along the
# x-axis in dark gold.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.ylim(0, 0.41)
plt.plot([left_normal, right_normal], [0, 0], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')
plt.legend();
We've constructed two confidence intervals for the population mean:
One using bootstrapping,
[left_boot, right_boot]
[10.7159, 15.43405]
and one using the CLT.
[left_normal, right_normal]
[10.50577709066518, 15.510222909334818]
In both cases, we only used information in my_sample
, not the population.
An approximate 95% confidence interval for the population mean is given by
$$ \left[\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}}, \text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}} \right] $$This CI doesn't require bootstrapping, and it only requires three numbers – the sample mean, the sample SD, and the sample size!
The bootstrap still has its uses!
Bootstrap | CLT | |
---|---|---|
Pro | Works for many sample statistics (mean, median, standard deviation). |
Only requires 3 numbers – the sample mean, sample SD, and sample size. |
Con | Very computationally expensive (requires drawing many, many samples from the original sample). |
Only works for the sample mean (and sum). |