# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')

np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from IPython.display import display, IFrame, HTML

def show_permutation_testing_intro():
    src="https://docs.google.com/presentation/d/e/2PACX-1vT3IfZAbqXtscEPu-nTl6lWZcXh6AWfjKsXZpWDNc0UhueXsOYQ7ivShlwbn-PW1EZm7CunTLtq7rmt/embed?start=false&loop=false&delayms=60000"
    width = 960
    height = 635
    display(IFrame(src, width, height))

def show_permutation_testing_summary():
    src = "https://docs.google.com/presentation/d/e/2PACX-1vSovXDonR6EmjrT45h4pY1mwmcKFMWVSdgpbKHC5HNTm9sbG7dojvvCDEQCjuk2dk1oA4gmwMogr8ZL/embed?start=false&loop=false&delayms=3000"
    width = 960
    height = 569
    display(IFrame(src, width, height))


jury = bpd.DataFrame().assign(
    Ethnicity=['Asian', 'Black', 'Latino', 'White', 'Other'],
    Eligible=[0.15, 0.18, 0.12, 0.54, 0.01],
    Panels=[0.26, 0.08, 0.08, 0.54, 0.04]
)
jury


def total_variation_distance(dist1, dist2):
    '''Computes the TVD between two categorical distributions, 
       assuming the categories appear in the same order.'''
    return np.abs((dist1 - dist2)).sum() / 2


jury


# Calculate the TVD between the distribution of ethnicities in the eligible population
# and the distribution of ethnicities in the observed panelists

total_variation_distance(jury.get('Eligible'), jury.get('Panels'))

0.14


jury.plot(kind='barh', x='Ethnicity', figsize=(10, 5))
plt.annotate('If you add up the total amount by which the blue bars\n are longer than the red bars, you get 0.14.', (0.08, 3.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));
plt.annotate('If you add up the total amount by which the red bars\n are longer than the blue bars, you also get 0.14!', (0.23, 0.9), bbox=dict(boxstyle="larrow,pad=0.3", fc="#e5e5e5", ec="black", lw=2));


eligible = jury.get('Eligible')
sample_distribution = np.random.multinomial(1453, eligible) / 1453 
sample_distribution

array([0.13, 0.18, 0.12, 0.56, 0.01])


total_variation_distance(sample_distribution, eligible)

0.023289745354439093


tvds = np.array([])
repetitions = 10000
for i in np.arange(repetitions):
    sample_distribution = np.random.multinomial(1453, eligible) / 1453
    new_tvd = total_variation_distance(sample_distribution, eligible)
    tvds = np.append(tvds, new_tvd)


observed_tvd = total_variation_distance(jury.get('Panels'), eligible)

bpd.DataFrame().assign(tvds=tvds).plot(kind='hist', density=True, bins=20, ec='w', figsize=(10, 5),
                                      title='Empirical Distribution of TVD Between Eligible Population and Random Sample')
plt.axvline(observed_tvd, color='black', linewidth=4, label='Observed Statistic')
plt.legend();


np.count_nonzero(tvds >= observed_tvd) / repetitions

0.0


jury.assign(RandomSample=sample_distribution).plot(kind='barh', x='Ethnicity', figsize=(10, 5),
                                                   title="A Random Sample is Usually Similar to the Eligible Population");


babies = bpd.read_csv('data/baby.csv').get(['Maternal Smoker', 'Birth Weight'])
babies


smokers = babies[babies.get('Maternal Smoker')]
non_smokers = babies[babies.get('Maternal Smoker') == False]


fig, ax = plt.subplots()
baby_bins = np.arange(50, 200, 5)
non_smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w', figsize=(10, 5))
smokers.plot(kind='hist', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w')
plt.legend(['Maternal Smoker = False', 'Maternal Smoker = True'])
plt.xlabel('Birth Weight');


means_table = babies.groupby('Maternal Smoker').mean()
means_table


# The difference between the mean birth weight for non-smokers and smokers
means = means_table.get('Birth Weight')
observed_difference = means.loc[False] - means.loc[True]
observed_difference

9.266142572024918


show_permutation_testing_intro()


data = bpd.DataFrame().assign(x=['a', 'b', 'c', 'd', 'e'], y=[1, 2, 3, 4, 5])
data


# The order of the rows are different,
# but each x is still in a row with the same y
# This is NOT what we want
data.sample(data.shape[0])


# Random!
np.random.permutation(data.get('x'))

array(['a', 'e', 'b', 'c', 'd'], dtype=object)


data.assign(shuffled_x=np.random.permutation(data.get('x')))


babies_with_shuffled = babies.assign(
    Shuffled_Labels=np.random.permutation(babies.get('Maternal Smoker'))
)
babies_with_shuffled


fig, ax = plt.subplots()
smokers = babies_with_shuffled[babies_with_shuffled.get('Shuffled_Labels')]
non_smokers = babies_with_shuffled[babies_with_shuffled.get('Shuffled_Labels') == False]
non_smokers.plot(kind='hist', y='Birth Weight', density=True, ax=ax, alpha=0.75, bins=baby_bins, ec='w', figsize=(10, 5))
smokers.plot(kind='hist',y='Birth Weight', density=True, ax=ax, alpha=0.75, bins=baby_bins)
plt.legend(['Maternal Smoker = False', 'Maternal Smoker = True'])
plt.xlabel('Birth Weight');


babies_with_shuffled.groupby('Shuffled_Labels').mean().get(['Birth Weight'])


group_means = babies_with_shuffled.groupby('Shuffled_Labels').mean().get('Birth Weight')
group_means.loc[False] - group_means.loc[True]

-0.041863583040054664


def difference_in_group_means(weights_df):
    group_means = weights_df.groupby('Shuffled_Labels').mean().get('Birth Weight')
    return group_means.loc[False] - group_means.loc[True]

difference_in_group_means(babies_with_shuffled)

-0.041863583040054664


n_repetitions = 500 # The dataset is large, so it takes too long to run if we use 5000 or 10000
differences = np.array([])

for i in np.arange(n_repetitions):
    # Step 1: Shuffle the labels
    shuffled_labels = np.random.permutation(babies.get('Maternal Smoker'))
    
    # Step 2: Put them in a DataFrame
    shuffled = babies_with_shuffled.assign(Shuffled_Labels=shuffled_labels)
    
    # Step 3: Compute the difference in group means and add the result to the differences array
    difference = difference_in_group_means(shuffled)
    
    differences = np.append(differences, difference)
    
differences

array([-0.29,  1.99,  0.  , ...,  1.29, -1.52,  0.84])


(bpd.DataFrame()
 .assign(simulated_diffs=differences)
 .plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);


(bpd.DataFrame()
 .assign(simulated_diffs=differences)
 .plot(kind='hist', bins=20, density=True, ec='w', figsize=(10, 5))
);
plt.axvline(observed_difference, color='black', linewidth=4, label='observed difference in means')
plt.legend();


smoker_p_value = np.count_nonzero(differences >= observed_difference) / n_repetitions
smoker_p_value

0.0


show_permutation_testing_summary()


babies.take(np.arange(3))

Class Standing	DSC 10	DSC 40A
Freshman	0.45	0.15
Sophomore	0.35	0.35
Junior	0.15	0.35
Senior+	0.05	0.15

	Ethnicity	Eligible	Panels
0	Asian	0.15	0.26
1	Black	0.18	0.08
2	Latino	0.12	0.08
3	White	0.54	0.54
4	Other	0.01	0.04

	Maternal Smoker	Birth Weight
0	False	120
1	False	113
2	True	128
...	...	...
1171	True	130
1172	False	125
1173	False	117

	x	y	shuffled_x
0	a	1	e
1	b	2	c
2	c	3	b
3	d	4	a
4	e	5	d

Lecture 17 – TVD, Permutation Testing¶

DSC 10, Fall 2022¶

Announcements¶

Agenda¶

Total variation distance¶

Recall: jury panels in Alameda County¶

Total variation distance¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Simulate drawing jury panels¶

The experiment¶

Repeating the experiment¶

Calculating the p-value¶

Are the jury panels representative?¶

Motivating A/B testing¶

Reflection¶

Looking forward¶

2008 Obama Campaign¶

Button choices¶

The winner¶

Example: Smoking and birth weight 👶¶

Smoking and birth weight¶

Visualizing the distribution of each group¶

The question¶

Setting up a hypothesis test¶

Discussion Question¶

Test statistic: the difference in group means¶

Setting up a hypothesis test¶

Generating new samples under the null hypothesis¶

Constructing a population¶

Permutations¶

Permutation tests¶

Permutation tests with DataFrames¶

Shuffling one column¶

How close are the means of the shuffled groups?¶

Simulation¶

Running the simulation¶

Conclusion of the test¶

Conclusion¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Summary, next time¶

Summary¶

A/B testing¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

Concept Check ✅ – Answer at cc.dsc10.com ¶