# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("png")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from IPython.display import display, IFrame, HTML

def show_permutation_testing_intro():
    '''Display the embedded Google Slides deck for the permutation testing intro.

    Renders the presentation in the notebook output as a 965x635 IFrame.
    '''
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vT3IfZAbqXtscEPu-nTl6lWZcXh6AWfjKsXZpWDNc0UhueXsOYQ7ivShlwbn-PW1EZm7CunTLtq7rmt/embed?start=false&loop=false&delayms=60000&rm=minimal"
    display(IFrame(slides_url, width=965, height=635))
    
def show_permutation_testing_summary():
    '''Display the embedded Google Slides deck for the permutation testing summary.

    Renders the presentation in the notebook output as a 960x569 IFrame.
    '''
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vSovXDonR6EmjrT45h4pY1mwmcKFMWVSdgpbKHC5HNTm9sbG7dojvvCDEQCjuk2dk1oA4gmwMogr8ZL/embed?start=false&loop=false&delayms=3000"
    frame = IFrame(slides_url, width=960, height=569)
    display(frame)


# One row per ethnicity. 'Eligible' holds that ethnicity's proportion in the
# eligible population and 'Panels' its proportion among the observed panelists.
jury = bpd.DataFrame().assign(
    Ethnicity=['Asian', 'Black', 'Latino', 'White', 'Other'],
    Eligible=[0.15, 0.18, 0.12, 0.54, 0.01],
    Panels=[0.26, 0.08, 0.08, 0.54, 0.04]
)
jury


def total_variation_distance(dist1, dist2):
    '''Return the total variation distance between two categorical distributions.

    The distributions are compared element by element, so their categories
    must be listed in the same order. The result is half the sum of the
    absolute differences between corresponding entries.
    '''
    absolute_differences = np.abs(dist1 - dist2)
    return absolute_differences.sum() / 2


jury


# Calculate the TVD between the distribution of ethnicities in the eligible population
# and the distribution of ethnicities in the observed panelists.

total_variation_distance(jury.get('Eligible'), jury.get('Panels'))

0.14


jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5))
plt.annotate('If you add up the total amount by which the blue bars\n are longer than the red bars, you get 0.14.', (0.08, 3.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
plt.annotate('If you add up the total amount by which the red bars\n are longer than the blue bars, you also get 0.14!', (0.23, 0.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));


eligible = jury.get('Eligible')
# Draw 1453 random categories using the eligible proportions as probabilities,
# then divide the resulting counts by 1453 to express them as proportions.
sample_distribution = np.random.multinomial(1453, eligible) / 1453 
sample_distribution

array([0.16, 0.17, 0.13, 0.53, 0.02])


total_variation_distance(sample_distribution, eligible)

0.024886441844459747


# Simulate `repetitions` random samples of 1453 draws from the eligible
# distribution and record each sample's TVD from that distribution.
repetitions = 10000
# Preallocate the results instead of growing them with np.append, which
# copies the entire array on every call and makes the loop quadratic.
tvds = np.empty(repetitions)
for i in np.arange(repetitions):
    sample_distribution = np.random.multinomial(1453, eligible) / 1453
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds[i] = new_tvd


# TVD between the observed panel distribution and the eligible distribution;
# this is the observed statistic marked on the histogram below.
observed_tvd = total_variation_distance(jury.get('Panels'), eligible)

# Density histogram of the simulated TVDs, with a vertical line at the observed TVD.
bpd.DataFrame().assign(tvds=tvds).plot(kind='hist', density=True, bins=20, ec='w', figsize=(10, 5),
                                      title='Empirical Distribution of TVD Between Eligible Population and Random Sample')
plt.axvline(observed_tvd, color='black', linewidth=4, label='Observed Statistic')
plt.legend();


# Empirical p-value: the fraction of simulated TVDs that are at least as
# large as the observed TVD.
np.count_nonzero(tvds >= observed_tvd) / repetitions

0.0


# Bar chart comparing the eligible, panel, and one randomly sampled distribution.
# With the cells in the order shown in this file, sample_distribution is the
# last sample drawn by the simulation loop.
jury.assign(RandomSample=sample_distribution).plot(kind='barh', x='Ethnicity', figsize=(10, 5),
                                                   title = "A Random Sample is Usually Similar to the Eligible Population");


temperatures = bpd.read_csv('data/temp.csv')
temperatures


temperatures.get('temperature').describe()

count    130.00
mean      98.25
std        0.73
          ...  
50%       98.30
75%       98.70
max      100.80
Name: temperature, Length: 8, dtype: float64


# Bootstrap the sample mean: draw 10000 resamples of 130 rows, with
# replacement, from the temperature sample and record each resample's mean.
# The results are preallocated instead of grown with np.append, which copies
# the entire array on every call and makes the loop quadratic.
resample_means = np.empty(10000)
for i in np.arange(resample_means.size):
    resample = temperatures.sample(130, replace=True)
    resample_means[i] = resample.get('temperature').mean()
resample_means

array([98.2 , 98.29, 98.29, ..., 98.31, 98.17, 98.37])


# 95% bootstrap-based confidence interval for the mean body temperature of all people:
# the endpoints are the 2.5th and 97.5th percentiles of the bootstrapped means,
# which bound the middle 95% of that distribution.
left_boot, right_boot = np.percentile(resample_means, [2.5, 97.5])

[left_boot, right_boot]

[98.11999999999999, 98.37538461538459]


sample_mean_mean = temperatures.get('temperature').mean()
sample_mean_mean

98.24923076923076


# Estimated SD of the distribution of the sample mean: the SD of the observed
# temperatures divided by the square root of the number of rows in the sample.
sample_mean_sd = np.std(temperatures.get('temperature')) / np.sqrt(temperatures.shape[0])
sample_mean_sd

0.06405661469519337


# Endpoints lie 2 SDs of the sample mean on either side of the sample mean;
# about 95% of a normal distribution falls within 2 SDs of its center
# (2 is a rounded stand-in for the more precise 1.96).
left_normal = sample_mean_mean - 2 * sample_mean_sd
right_normal = sample_mean_mean + 2 * sample_mean_sd

# 95% CLT-based confidence interval for the mean body temperature of all people:
[left_normal, right_normal]

[98.12111753984037, 98.37734399862116]


def normal_curve(x, mu=0, sigma=1):
    '''Return the height of the normal density curve at x.

    x may be a single number or a NumPy array; mu is the mean of the curve
    and sigma its standard deviation (standard normal by default).
    '''
    variance = sigma ** 2
    coefficient = 1 / np.sqrt(2 * np.pi * variance)
    exponent = -((x - mu) ** 2) / (2 * variance)
    return coefficient * np.exp(exponent)

# Density histogram of the bootstrapped means, with the bootstrap interval
# drawn as a thick gold segment along y = 0.
bpd.DataFrame().assign(resample_means=resample_means).plot(kind='hist', y='resample_means', alpha=0.65, bins=20, density=True, ec='w', figsize=(10, 5), title='Distribution of Bootstrapped Sample Means');
plt.plot([left_boot, right_boot], [0, 0], color='gold', linewidth=10, label='95% bootstrap-based confidence interval');

# Overlay the normal curve with the CLT-based mean and SD, evaluated on an
# evenly spaced grid over the plotted x-range.
norm_x = np.linspace(98, 98.7)
norm_y = normal_curve(norm_x, mu=sample_mean_mean, sigma=sample_mean_sd)
plt.plot(norm_x, norm_y, color='black', linestyle='--', linewidth=4, label='Distribution of the Sample Mean (via the CLT)')
plt.xlim(98, 98.7)
# The CLT interval is drawn at y = -0.3, presumably so that it sits just below
# the bootstrap interval at y = 0 instead of covering it.
plt.plot([left_normal, right_normal], [-0.3, -0.3], color='#8f6100', linewidth=10, label='95% CLT-based confidence interval')


plt.legend();


babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies


# Split the rows into two groups using the boolean 'Maternal Smoker' column.
smokers = babies[babies.get('Maternal Smoker')]
non_smokers = babies[babies.get('Maternal Smoker') == False]


# Overlay density histograms for the two groups on the same axes, using the
# same bin edges (50, 55, ..., 195) for both so the bars line up.
fig, ax = plt.subplots()
baby_bins = np.arange(50, 200, 5)
non_smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w', figsize=(10, 5))
smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w')
# Legend labels are listed in the order the histograms were drawn above.
plt.legend(['Maternal Smoker = False', 'Maternal Smoker = True'])
plt.xlabel('Birth Weight');


means_df = babies.groupby('Maternal Smoker').mean()
means_df


# The difference between the mean birth weight for non-smokers and smokers.
means = means_df.get('Birth Weight')
observed_difference = means.loc[False] - means.loc[True]
observed_difference

9.266142572024918


show_permutation_testing_intro()


babies.take(np.arange(3))

Class Standing	DSC 10	DSC 40A
Freshman	0.45	0.15
Sophomore	0.35	0.35
Junior	0.15	0.35
Senior+	0.05	0.15

	temperature
0	96.3
1	96.7
2	96.9
...	...
127	99.9
128	100.0
129	100.8

	Ethnicity	Eligible	Panels
0	Asian	0.15	0.26
1	Black	0.18	0.08
2	Latino	0.12	0.08
3	White	0.54	0.54
4	Other	0.01	0.04

	Maternal Smoker	Birth Weight
0	False	120
1	False	113
2	True	128
...	...	...
1171	True	130
1172	False	125
1173	False	117

Lecture 22 – TVD, Hypothesis Testing, and Permutation Testing¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Total variation distance¶

Recall: Jury panels in Alameda County¶

Are the differences in representation meaningful?¶

Total variation distance¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Simulate drawing jury panels¶

The simulation¶

Repeating the experiment¶

Calculating the p-value¶

Are the jury panels representative?¶

Confidence intervals for hypothesis testing¶

Hypothesis testing, another way¶

Example: Body temperature 🌡¶

Setting up a hypothesis test¶

Bootstrap-based confidence interval for mean body temperature¶

CLT-based confidence interval for mean body temperature¶

Comparing intervals and interpreting the results¶

What happened?¶

Comparing two samples¶

Reflection¶

2008 Obama Campaign¶

Button choices¶

The winner¶

Example: Smoking and birth weight 👶¶

Smoking and birth weight¶

Visualizing the distribution of each group¶

The question¶

Setting up a hypothesis test¶

Discussion Question¶

Test statistic: the difference in group means¶

Setting up a hypothesis test¶

Generating new samples under the null hypothesis¶

Constructing a population¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

A/B testing¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶