# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
from scipy import stats
set_matplotlib_formats("svg")
plt.style.use('ggplot')

# Keep printed output compact for slides: summarize NumPy arrays with more than
# 20 elements, show 2 decimal places, and suppress scientific notation.
np.set_printoptions(threshold=20, precision=2, suppress=True)
# Limit how many rows/columns pandas displays and show 2 decimal places.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

# Animations
from IPython.display import display, IFrame, HTML
import ipywidgets as widgets

def normal_curve(x, mu=0, sigma=1):
    """Evaluate the normal probability density function at ``x``.

    Parameters
    ----------
    x : number or array-like
        Point(s) at which to evaluate the curve.
    mu : number, default 0
        Mean (center) of the normal distribution.
    sigma : number, default 1
        Standard deviation of the normal distribution.

    Returns
    -------
    Same shape as ``x``: the height of the normal density at each point.
    """
    # The normalizing constant must include sigma; without it the area under
    # the curve equals sigma instead of 1 whenever sigma != 1.
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))

def normal_area(a, b, bars=False, title=None):
    """Draw the standard normal curve on [-4, 4] and shade the part between a and b.

    Parameters
    ----------
    a, b : number
        Left and right endpoints of the shaded region.
    bars : bool, default False
        If true, also draw red vertical lines at ``a`` and ``b``.
    title : str, optional
        Plot title; when omitted (or empty), a default title naming the
        rounded endpoints is used.
    """
    grid = np.linspace(-4, 4)
    density = normal_curve(grid)
    inside = (grid >= a) & (grid <= b)

    plt.plot(grid, density, color='black')
    plt.fill_between(grid[inside], density[inside], color='gold')

    if bars:
        for edge in (a, b):
            plt.axvline(edge, color='red')

    # Any falsy title (None or '') falls back to the default description.
    plt.title(title if title else f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
    plt.show()
    
def area_within(z):
    """Plot the standard normal area between -z and z, titled with its proportion.

    The proportion is computed from the standard normal CDF and rounded to
    4 decimal places.
    """
    proportion = np.round(stats.norm.cdf(z) - stats.norm.cdf(-z), 4)
    normal_area(-z, z, title=f'Proportion of values within {z} SDs of the mean: {proportion}')

def show_clt_slides():
    """Display the embedded Google Slides deck in a 960x509 IFrame."""
    slides = IFrame(
        "https://docs.google.com/presentation/d/e/2PACX-1vTcJd3U1H1KoXqBFcWGKFUPjZbeW4oiNZZLCFY8jqvSDsl4L1rRTg7980nPs1TGCAecYKUZxH5MZIBh/embed?start=false&loop=false&delayms=3000",
        width=960,
        height=509,
    )
    display(slides)


# Render the embedded slide deck in the notebook output.
show_clt_slides()


# Seed NumPy's global random number generator before sampling.
# NOTE(review): presumably this makes `delays.sample` reproducible — confirm
# that babypandas draws from NumPy's global random state.
np.random.seed(42)
delays = bpd.read_csv('data/delays.csv')
# Take a sample of 500 rows from the full table of delays.
my_sample = delays.sample(500)


# Bootstrap the sample mean: repeatedly resample from my_sample with
# replacement and record the mean 'Delay' of each resample.
resample_means = np.array([])
repetitions = 2000

for i in np.arange(repetitions):
    # Each resample is the same size as the original sample; the size is read
    # from my_sample rather than hard-coded so the two cannot drift apart.
    resample = my_sample.sample(my_sample.shape[0], replace=True)
    resample_mean = resample.get('Delay').mean()
    resample_means = np.append(resample_means, resample_mean)
    
resample_means

array([12.65, 11.5 , 11.34, ..., 12.59, 11.89, 12.58])


# Density histogram (20 bins) of the bootstrapped sample means.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', density=True, ec='w', alpha=0.65, bins=20, figsize=(10, 5));
# Mark the mean of the resample means with a green triangle just below the
# x-axis and a green vertical line.
plt.scatter([resample_means.mean()], [-0.005], marker='^', color='green', s=250)
plt.axvline(resample_means.mean(), color='green', label=f'mean={np.round(resample_means.mean(), 2)}', linewidth=4)
plt.xlim(7, 20)
# The lower y-limit is negative so the triangle marker at y = -0.005 is visible.
plt.ylim(-0.015, 0.35)
plt.legend();


# Endpoints of the middle 95% of the bootstrapped means: the 2.5th and 97.5th
# percentiles.
left_boot = np.percentile(resample_means, 2.5)
right_boot = np.percentile(resample_means, 97.5)
[left_boot, right_boot]

[10.7159, 15.43405]


# Histogram of the bootstrapped means with the interval [left_boot, right_boot]
# drawn as a thick gold segment along y = 0.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');
plt.xlim(7, 20);
plt.legend();


# Mean of the 'Delay' column in our sample; used below as the center (mu) of
# the normal curve for the distribution of the sample mean.
samp_mean_mean = my_sample.get('Delay').mean()
samp_mean_mean

13.008


# SD of the 'Delay' values in the sample divided by the square root of the
# number of rows in the sample; used below as the SD (sigma) of the normal curve.
samp_mean_sd = np.std(my_sample.get('Delay')) / np.sqrt(my_sample.shape[0])
samp_mean_sd

1.2511114546674091


# Draw the normal curve with mean samp_mean_mean and SD samp_mean_sd over the
# same x-range (7 to 20) used for the bootstrap histograms.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.legend();


# Shade the area under the standard normal curve between -2 and 2.
normal_area(-2, 2)


# The same area computed numerically from the standard normal CDF.
stats.norm.cdf(2) - stats.norm.cdf(-2)

0.9544997361036416


# CLT-based interval: sample mean plus or minus 2 SDs of the sample mean's
# distribution, where that SD is (sample SD) / sqrt(sample size).
my_delays = my_sample.get('Delay')
# The sample size is read from my_sample (as in the samp_mean_sd computation)
# instead of being hard-coded, so it always matches the sample actually used.
left_normal = my_delays.mean() - 2 * np.std(my_delays) / np.sqrt(my_sample.shape[0])
right_normal = my_delays.mean() + 2 * np.std(my_delays) / np.sqrt(my_sample.shape[0])
[left_normal, right_normal]

[10.50577709066518, 15.510222909334818]


# Redraw the normal curve for the sample mean and overlay the interval
# [left_normal, right_normal] as a thick segment along y = 0.
plt.figure(figsize=(10, 5))
norm_x = np.linspace(7, 20)
norm_y = normal_curve(norm_x, mu=samp_mean_mean, sigma=samp_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(7, 20)
plt.ylim(0, 0.41)
plt.plot([left_normal, right_normal], [0, 0], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')
plt.legend();


# Bootstrap-based interval, redisplayed for comparison.
[left_boot, right_boot]

[10.7159, 15.43405]


# CLT-based interval, redisplayed for comparison.
[left_normal, right_normal]

[10.50577709066518, 15.510222909334818]


# Interactive demo: a slider for z (0 to 4 in steps of 0.05, starting at 2)
# whose value is passed to area_within to draw the plot.
z = widgets.FloatSlider(value=2, min=0,max=4,step=0.05, description='z')
ui = widgets.HBox([z])
out = widgets.interactive_output(area_within, {'z': z})
display(ui, out)


# Load the body temperature data and display it.
temperatures = bpd.read_csv('data/temp.csv')
temperatures


# Summary statistics of the 'temperature' column.
temperatures.get('temperature').describe()

count    130.00
mean      98.25
std        0.73
          ...  
50%       98.30
75%       98.70
max      100.80
Name: temperature, Length: 8, dtype: float64


# Mean of the 'temperature' column in the sample.
sample_mean = temperatures.get('temperature').mean()
sample_mean

98.24923076923078


# SD of the sample's temperatures divided by the square root of the number of
# rows in the sample.
sample_mean_sd = np.std(temperatures.get('temperature')) / np.sqrt(temperatures.shape[0])
sample_mean_sd

0.06405661469519337


# 95% confidence interval for the mean body temperature of all people:
# the sample mean plus or minus 2 times sample_mean_sd.
[sample_mean - 2 * sample_mean_sd, sample_mean + 2 * sample_mean_sd]

[98.12111753984038, 98.37734399862117]


# Density histogram (20 bins) of the sample's temperatures, with the interval
# sample_mean ± 2 * sample_mean_sd drawn as a thick gold segment along y = 0.
plt.figure(figsize=(10, 5))
plt.hist(temperatures.get('temperature'), density=True, bins=20, ec='w');
plt.title('Sample Distribution of Body Temperature (ºF)');
plt.plot([sample_mean - 2*sample_mean_sd, sample_mean + 2*sample_mean_sd], [0, 0], color='gold', linewidth=20, label='95% CLT-based confidence interval')
plt.legend();


# 95% confidence interval for the mean body temperature of all people
# (same computation as before, redisplayed):
[sample_mean - 2 * sample_mean_sd, sample_mean + 2 * sample_mean_sd]

[98.12111753984038, 98.37734399862117]


# Plot the SD of a collection of 0s and 1s with p proportion of 0s.
# The SD is sqrt(p * (1 - p)), evaluated for p = 0, 0.01, ..., 1.
p = np.arange(0, 1.01, 0.01)
sd = np.sqrt(p * (1 - p))
plt.plot(p, sd)
plt.xlabel('p')
plt.ylabel(r'$\sqrt{p(1-p)}$');


# NOTE(review): presumably the sample size n at which an interval of width
# 4 * SD / sqrt(n) equals 0.06 when SD is 0.5 — confirm against the lecture text.
(4 * 0.5 / 0.06) ** 2

1111.1111111111113

	Bootstrap	CLT
Pro	Works for many sample statistics (mean, median, standard deviation).	Only requires 3 numbers – the sample mean, sample SD, and sample size.
Con	Very computationally expensive (requires drawing many, many samples from the original sample).	Only works for the sample mean (and sum).

	temperature
0	96.3
1	96.7
2	96.9
...	...
127	99.9
128	100.0
129	100.8

Lecture 23 – The Central Limit Theorem, Choosing Sample Sizes¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

The Central Limit Theorem¶

The Central Limit Theorem¶

Confidence intervals¶

Confidence intervals¶

Constructing a 95% confidence interval through bootstrapping¶

Middle 95% of a normal distribution¶

Recall: Normal approximations¶

Computing a 95% confidence interval using the CLT¶

Visualizing the CLT-based confidence interval¶

Comparing confidence intervals¶

Recap: Confidence intervals for the population mean¶

Bootstrapping vs. the CLT¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Hypothesis testing, revisited¶

Hypothesis testing for the mean¶

Using a confidence interval for hypothesis testing¶

Example: Body temperature 🌡¶

Setting up a hypothesis test¶

CI for mean body temperature¶

Conclusion¶

Choosing sample sizes¶

Example: Polling¶

Aside: Proportions are just means¶

Our strategy¶

Our strategy¶

Upper bound for the standard deviation of a sample¶

Choosing a sample size¶

Choosing a sample size¶

Activity¶

Summary, next time¶

Summary¶

What we've learned about inference¶

Next time¶

Lecture 23 – The Central Limit Theorem, Choosing Sample Sizes¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

The Central Limit Theorem¶

The Central Limit Theorem¶

Confidence intervals¶

Confidence intervals¶

Constructing a 95% confidence interval through bootstrapping¶

Middle 95% of a normal distribution¶

Recall: Normal approximations¶

Computing a 95% confidence interval using the CLT¶

Visualizing the CLT-based confidence interval¶

Comparing confidence intervals¶

Recap: Confidence intervals for the population mean¶

Bootstrapping vs. the CLT¶

Activity¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Hypothesis testing, revisited¶

Hypothesis testing for the mean¶

Using a confidence interval for hypothesis testing¶

Example: Body temperature 🌡¶

Setting up a hypothesis test¶

CI for mean body temperature¶

Conclusion¶

Choosing sample sizes¶

Example: Polling¶

Aside: Proportions are just means¶

Our strategy¶

Our strategy¶

Upper bound for the standard deviation of a sample¶

Choosing a sample size¶

Choosing a sample size¶

Activity¶

Summary, next time¶

Summary¶

What we've learned about inference¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com¶