# Run this cell to set up packages for lecture.
from lec21_imports import *

# Ethnicity distributions, as proportions, for the eligible population and for
# the observed jury panels. The table is built one column at a time.
jury = (
    bpd.DataFrame()
    .assign(Ethnicity=['Asian', 'Black', 'Latino', 'White', 'Other'])
    .assign(Eligible=[0.15, 0.18, 0.12, 0.54, 0.01])
    .assign(Panels=[0.26, 0.08, 0.08, 0.54, 0.04])
)
jury

def total_variation_distance(dist1, dist2):
    '''Return the total variation distance (TVD) between two categorical
       distributions whose categories are listed in the same order.

       The TVD is half of the sum of the absolute differences between
       the two distributions.'''
    absolute_gaps = np.abs(dist1 - dist2)
    return absolute_gaps.sum() / 2

jury

# How far apart are the distribution of ethnicities in the eligible population
# and the distribution of ethnicities in the observed panelists? Measure it with the TVD.

total_variation_distance(dist1=jury.get('Eligible'), dist2=jury.get('Panels'))

0.14

# Horizontal bar chart of the Eligible and Panels proportions, grouped by ethnicity.
jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5))
# Two callout boxes placed at hand-picked data coordinates (x, y) on the chart.
# The trailing semicolons keep the notebook from echoing the returned objects.
plt.annotate('If you add up the total amount by which the blue bars\n are longer than the red bars, you get 0.14.', (0.08, 3.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
plt.annotate('If you add up the total amount by which the red bars\n are longer than the blue bars, you also get 0.14!', (0.23, 0.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));

# Simulate one panel of 1453 people drawn at random according to the eligible
# distribution, then turn the per-category counts into proportions.
eligible = jury.get('Eligible')
simulated_counts = np.random.multinomial(1453, eligible)
sample_distribution = simulated_counts / 1453
sample_distribution

array([0.15, 0.16, 0.11, 0.56, 0.01])

# TVD between the one simulated sample distribution and the eligible distribution.
total_variation_distance(sample_distribution, eligible)

0.023331039229180985

# Simulate `repetitions` random panels of 1453 people from the eligible
# distribution and record each panel's TVD from that distribution.
#
# The results array is allocated once and filled by index. np.append returns a
# fresh copy of the whole array on every call, which makes an append-in-a-loop
# quadratic in `repetitions`.
#
# `sample_distribution` and `new_tvd` are deliberately left as plain loop
# variables so they still hold the final iteration's values afterwards.
repetitions = 10000
tvds = np.empty(repetitions)
for i in np.arange(repetitions):
    sample_distribution = np.random.multinomial(1453, eligible) / 1453
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds[i] = new_tvd

# Observed test statistic: the TVD between the actual panels and the eligible distribution.
observed_tvd = total_variation_distance(jury.get('Panels'), eligible)

# Density histogram of the simulated TVDs, with the observed TVD drawn as a
# thick vertical line for comparison.
bpd.DataFrame().assign(tvds=tvds).plot(kind='hist', density=True, bins=20, ec='w', figsize=(10, 5),
                                      title='Empirical Distribution of TVD Between Eligible Population and Random Sample')
plt.axvline(observed_tvd, color='black', linewidth=4, label='Observed Statistic')
plt.legend();

# p-value: the proportion of simulated TVDs that are at least as large as the observed TVD.
np.count_nonzero(tvds >= observed_tvd) / repetitions

0.0

# Add one simulated sample as a third column and plot all three distributions
# side by side. `sample_distribution` holds whatever was assigned to it most
# recently (the simulation loop reassigns it on every iteration).
jury.assign(RandomSample=sample_distribution).plot(kind='barh', x='Ethnicity', figsize=(10, 5),
                                                   title = "A Random Sample is Usually Similar to the Eligible Population");

# Load the body temperature sample from disk and display it.
temperatures = bpd.read_csv('data/temp.csv')
temperatures

# Summary statistics (count, mean, std, quartiles, ...) of the 'temperature' column.
temperatures.get('temperature').describe()

count    130.00
mean      98.25
std        0.73
          ...  
50%       98.30
75%       98.70
max      100.80
Name: temperature, Length: 8, dtype: float64

# Bootstrap the sample mean: repeatedly resample the temperature data with
# replacement and record the mean of each resample.
#
# Each resample must be the same size as the original sample, so the size is
# read from the data instead of being hard-coded.
#
# The results array is allocated once and filled by index. np.append returns a
# fresh copy of the whole array on every call, which makes an append-in-a-loop
# quadratic in the number of resamples.
num_resamples = 10000
sample_size = temperatures.shape[0]
resample_means = np.empty(num_resamples)
for i in np.arange(num_resamples):
    resample = temperatures.sample(sample_size, replace=True)
    resample_means[i] = resample.get('temperature').mean()
resample_means

array([98.18, 98.23, 98.2 , ..., 98.26, 98.26, 98.26])

# The middle 95% of the bootstrapped means runs from the 2.5th percentile
# to the 97.5th percentile; compute both endpoints in a single call.
left_boot, right_boot = np.percentile(resample_means, [2.5, 97.5])

# 95% bootstrap-based confidence interval for the mean body temperature of all people:
[left_boot, right_boot]

[98.12307692307692, 98.3723076923077]

# Center of the CLT-based distribution of the sample mean: the mean of the observed sample.
sample_mean_mean = temperatures.get('temperature').mean()
sample_mean_mean

98.24923076923078

# Spread of the CLT-based distribution of the sample mean: the SD of the
# observed sample divided by the square root of the sample size (number of rows).
sample_mean_sd = np.std(temperatures.get('temperature')) / np.sqrt(temperatures.shape[0])
sample_mean_sd

0.06405661469519337

# Endpoints lie 2 SDs (of the sample mean's distribution) on either side of its center.
left_normal, right_normal = (
    sample_mean_mean - 2 * sample_mean_sd,
    sample_mean_mean + 2 * sample_mean_sd,
)

# 95% CLT-based confidence interval for the mean body temperature of all people:
[left_normal, right_normal]

[98.12111753984038, 98.37734399862117]

def normal_curve(x, mu=0, sigma=1):
    '''Evaluate the normal density with mean mu and standard deviation
       sigma at x (a number or an array of numbers).'''
    variance = sigma ** 2
    scale = 1 / np.sqrt(2 * np.pi * variance)
    exponent = -((x - mu) ** 2) / (2 * variance)
    return scale * np.exp(exponent)

# Density histogram of the bootstrapped sample means, with the bootstrap-based
# interval drawn as a thick gold segment along y = 0.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');

# Overlay the normal curve centered at the sample mean with the CLT-based SD,
# evaluated on evenly spaced x-values that span the same range as the x-axis limits.
norm_x = np.linspace(98, 98.7)
norm_y = normal_curve(norm_x, mu=sample_mean_mean, sigma=sample_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(98, 98.7)
# The CLT-based interval is drawn slightly below y = 0 so it does not cover the gold segment.
plt.plot([left_normal, right_normal], [-0.3, -0.3], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')


plt.legend();

# Load the baby data, keeping only the two columns used below.
babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies

# Split the rows into two groups using the boolean 'Maternal Smoker' column.
smokers = babies[babies.get('Maternal Smoker')]
non_smokers = babies[babies.get('Maternal Smoker') == False]

# Draw both groups' histograms on the same axes so they can be compared directly.
fig, ax = plt.subplots()
# Shared bin edges: 50, 55, ..., 195.
baby_bins = np.arange(50, 200, 5)
non_smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w', figsize=(10, 5))
smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w')
# Legend labels are given in drawing order: non-smokers first, then smokers.
plt.legend(['Maternal Smoker = False', 'Maternal Smoker = True'])
plt.xlabel('Birth Weight');

# Mean of each remaining column within each 'Maternal Smoker' group; the group
# labels (False / True) become the index of the result.
means_df = babies.groupby('Maternal Smoker').mean()
means_df

# The difference between the mean birth weight for non-smokers and smokers.
means = means_df.get('Birth Weight')
# Look up each group's mean by its index label: non-smokers (False) minus smokers (True).
observed_difference = means.loc[False] - means.loc[True]
observed_difference

9.266142572024918

# NOTE(review): show_permutation_testing_intro is not defined in this file;
# presumably it comes from the star import of lec21_imports -- confirm.
show_permutation_testing_intro()

# The rows of `babies` at positions 0, 1, and 2.
babies.take(np.arange(3))

Class Standing	DSC 10	DSC 40A
Freshman	0.45	0.15
Sophomore	0.35	0.35
Junior	0.15	0.35
Senior+	0.05	0.15

	temperature
0	96.3
1	96.7
2	96.9
...	...
127	99.9
128	100.0
129	100.8

	Ethnicity	Eligible	Panels
0	Asian	0.15	0.26
1	Black	0.18	0.08
2	Latino	0.12	0.08
3	White	0.54	0.54
4	Other	0.01	0.04

	Maternal Smoker	Birth Weight
0	False	120
1	False	113
2	True	128
...	...	...
1171	True	130
1172	False	125
1173	False	117

Lecture 21 – TVD, Hypothesis Testing, and Permutation Testing¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Total variation distance¶

Recall: Jury panels in Alameda County¶

Are the differences in representation meaningful?¶

Total variation distance¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Simulate drawing jury panels¶

The simulation¶

Repeating the experiment¶

Calculating the p-value¶

Are the jury panels representative?¶

Confidence intervals for hypothesis testing¶

Hypothesis testing, another way¶

Example: Body temperature 🌡¶

Setting up a hypothesis test¶

Bootstrap-based confidence interval for mean body temperature¶

CLT-based confidence interval for mean body temperature¶

Comparing intervals and interpreting the results¶

What happened?¶

Quiz 5 material ends here!¶

Comparing two samples¶

Reflection¶

2008 Obama Campaign¶

Button choices¶

The winner¶

Example: Smoking and birth weight 👶¶

Smoking and birth weight¶

Visualizing the distribution of each group¶

The question¶

Setting up a hypothesis test¶

Discussion Question¶

Test statistic: the difference in group means¶

Setting up a hypothesis test¶

Generating new samples under the null hypothesis¶

Constructing a population¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

A/B testing¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶