from dsc80_utils import *
📣 Announcements 📣¶
- Good job on Project 1!
- Lab 3 due on Mon.
📆 Agenda¶
- Data scope
- What is hypothesis testing? (Review from DSC 10)
- Why is hypothesis testing confusing to learn?
- Hypothesis testing examples
- Coin flipping
- Total variation distance
- Permutation testing
- Student-submitted hypothesis testing questions
Data Scope¶
- Target population: All elements of the population you ultimately want to draw conclusions about.
- Access frame: All elements that are accessible to you for measurement and observation.
- Sample: Subset of the access frame that you observed / measured.
Example: Wikipedia Awards¶
- A 2012 paper asked: if we give awards to Wikipedia contributors, will they contribute more?
- Took the top 1% of Wikipedia contributors, excluded people who had already received an award, then took a random sample of 200 contributors.
Example: Who will win the election?¶
- 2016 US Presidential Election: most pollsters predicted Clinton to win over Trump.
- Randomly selected people and asked them a question over the phone.
🔑 Key Idea: Random samples look like the access frame they were sampled from¶
- This enables statistical inference!
- But keep in mind, random samples look like their access frame, which can be different from the population itself.
What is Hypothesis Testing? (Review from DSC 10)¶
- One common way to use your sample to draw conclusions about your population (Understanding the World).
- Other common method is to use confidence intervals.
The Basics¶
- You ran an experiment, or found something interesting in your data.
- E.g., you created a new vaccine and ran an experiment comparing it to a placebo.
- You want to "prove" that your treatment works.
- Or that your data actually show an interesting pattern, etc.
- So you want to compare your data against a reasonable baseline.
- Hypothesis testing: a way to quantitatively describe how "different" your data are from a baseline.
- In DSC 10, $ p < 0.05 $ means that the data look different enough to act on.
Hypothesis Testing Setup¶
Why is Hypothesis Testing Hard to Learn?¶
- You'll probably be confused this time, too!
Philosophical reasons:
- It's like "proof by contradiction":
- If I want to show that my vaccine works, I consider a world where it doesn't (null hypothesis).
- Then, I "attack the baseline" by showing that under the null hypothesis my data would be very unlikely.
- Showing something is not true is a lot easier than showing something is true!
Technical reasons:
- Several choices to make:
- What a good null hypothesis is.
- Test statistic.
- How to simulate more samples from the null population.
- Whether to look at the left tail or right tail of sampling distribution.
- Good news: almost all hypothesis tests follow this format!
Example: Coin flipping¶
Coin flipping¶
Suppose that we flipped a coin 100 times, and we saw 59 heads. We want to show that the coin is biased in favor of heads:
Observation: We flipped a coin 100 times, and saw 59 heads and 41 tails.
Null Hypothesis: The coin is fair.
Alternative Hypothesis: The coin is biased in favor of heads.
Test Statistic: Number of heads, $N_H$.
Generating the null distribution¶
Now that we've chosen a test statistic, we need to generate the distribution of the test statistic under the assumption the null hypothesis is true, i.e. the null distribution.
This distribution will give us, for instance:
- The probability of seeing 4 heads in 100 flips of a fair coin.
- The probability of seeing at most 46 heads in 100 flips of a fair coin.
- The probability of seeing at least 59 heads in 100 flips of a fair coin.
Generating the null distribution, using math¶
The number of heads in 100 flips of a fair coin follows the $\text{Binomial}(100, 0.5)$ distribution, in which
$$P(\text{# heads} = k) = {100 \choose k} (0.5)^k (1-0.5)^{100-k} = {100 \choose k} 0.5^{100}$$
from scipy.special import comb
def p_k_heads(k):
    return comb(100, k) * (0.5) ** 100
The probability that we see at least 59 heads is then:
sum([p_k_heads(k) for k in range(59, 101)])
0.04431304005703377
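As a sanity check, scipy.stats.binom implements this same distribution, so we can compute the probabilities listed above directly (a quick sketch; pmf, cdf, and sf are the exact PMF, CDF, and survival function):
from scipy.stats import binom
binom.pmf(4, 100, 0.5)   # P(exactly 4 heads in 100 flips)
binom.cdf(46, 100, 0.5)  # P(at most 46 heads in 100 flips)
binom.sf(58, 100, 0.5)   # P(at least 59 heads); matches the sum above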
Let's look at this distribution visually.
# Also, this line tells pandas to generate plotly plots by default!
pd.options.plotting.backend = 'plotly'
plot_df = pd.DataFrame().assign(k = range(101))
plot_df['p_k'] = p_k_heads(plot_df['k'])
plot_df['color'] = plot_df['k'].apply(lambda k: 'orange' if k >= 59 else 'blue')
fig = plot_df.plot(kind='bar', x='k', y='p_k', color='color', width=1000)
fig
# fig.add_annotation(text='This orange area is the p-value!', x=77, y=0.008, showarrow=False)
Making a decision¶
We saw that, in 100 flips of a fair coin, $P(\text{# heads} \geq 59)$ is only ~4.4%.
This is quite low – it suggests that our observed result is quite unlikely under the null.
As such, we will reject the null hypothesis – our observation is not consistent with the hypothesis that the coin is fair.
The null still may be true – it's possible that the coin we flipped was fair, and we just happened to see a rare result. For the same reason, we also cannot "accept" the alternative.
This probability – the probability of seeing a result at least as extreme as the observed, under the null hypothesis – is called the p-value.
- If the p-value is below a pre-defined cutoff (often 5%), we reject the null.
- Otherwise, we fail to reject the null.
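In code, the decision is a single comparison against the cutoff (a small sketch, reusing the p_k_heads function from above):
p_value = sum([p_k_heads(k) for k in range(59, 101)])
p_value < 0.05  # True, so we reject the null at the 5% cutoff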
⚠️ We can't "accept" the null!¶
Note that we are very careful in saying that we either reject the null or fail to reject the null.
Just because we fail to reject the null, it doesn't mean the null is true – we cannot "accept" it.
Example:
- Suppose there is a coin that is truly biased towards heads, with probability 0.55.
- We flip it 10 times and see 5 heads and 5 tails.
- If we conduct a hypothesis test where the null is that the coin is fair, we will fail to reject the null.
- But the null isn't true.
In our framework:¶
Generating the null distribution, using simulation¶
In the most recent example, we computed the true probability distribution of the test statistic under the null hypothesis.
We could only do this because we know that the number of heads in $N$ flips of a fair coin follows the $\text{Binomial}(N, 0.5)$ distribution.
Often, we'll pick test statistics for which we don't know the true probability distribution. In such cases, we'll have to simulate, as we did in DSC 10.
Simulations provide us with empirical distributions of test statistics; if we simulate with a large (>= 10,000) number of repetitions, the empirical distribution of the test statistic should look similar to the true probability distribution of the test statistic.
Generating the null distribution, using simulation¶
First, let's figure out how to perform one instance of the experiment – that is, how to flip 100 coins once. Recall, to sample from a categorical distribution, we use np.random.multinomial.
# Flipping a fair coin 100 times.
# Interpret the result as [Heads, Tails].
np.random.multinomial(100, [0.5, 0.5])
array([53, 47])
Then, we can repeat it a large number of times.
# 100,000 times, we want to flip a coin 100 times.
results = []
for _ in range(100_000):
    num_heads = np.random.multinomial(100, [0.5, 0.5])[0]
    results.append(num_heads)
Each entry in results is the number of heads in 100 simulated coin flips.
results[:10]
[50, 44, 48, 47, 47, 54, 38, 47, 48, 48]
Visualizing the empirical distribution of the test statistic¶
fig = px.histogram(pd.DataFrame(results), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of # Heads in 100 Flips of a Fair Coin')
fig.add_vline(x=59, line_color='red')
fig.update_layout(xaxis_range=[0, 100])
Again, we can compute the p-value, which is the probability of seeing a result as or more extreme than the observed, under the null.
(np.array(results) >= 59).mean()
0.04391
Note that this number is close, but not identical, to the true p-value we found before. That's because we computed this p-value using a simulation, and hence an approximation.
In our framework:¶
Reflection¶
Can we make things faster? 🏃¶
A mantra so far in this course has been avoid for-loops whenever possible. That applies here, too.
np.random.multinomial (and np.random.choice) accepts a size argument. By providing size=100_000, we can tell numpy to flip 100 coins, 100_000 times, without needing a for-loop!
# An array with 100000 rows and 2 columns.
np.random.multinomial(100, [0.5, 0.5], size=100_000)
array([[55, 45],
       [45, 55],
       [48, 52],
       ...,
       [53, 47],
       [50, 50],
       [50, 50]])
# Just the first column of the above array. Note the iloc-like syntax.
np.random.multinomial(100, [0.5, 0.5], size=100_000)[:, 0]
array([45, 55, 38, ..., 53, 47, 50])
%%time
faster_results = np.random.multinomial(100, [0.5, 0.5], size=100_000)[:, 0]
CPU times: user 13 ms, sys: 1.67 ms, total: 14.7 ms
Wall time: 12.6 ms
The above approach is orders of magnitude faster than the for-loop approach! With that said, you are still allowed to use for-loops for hypothesis (and permutation) tests on assignments.
%%time
# 100,000 times, we want to flip a coin 100 times.
results = []
for _ in range(100_000):
    num_heads = np.random.multinomial(100, [0.5, 0.5])[0]
    results.append(num_heads)
CPU times: user 1.61 s, sys: 19.5 ms, total: 1.63 s
Wall time: 1.62 s
Choosing alternative hypotheses and test statistics¶
The alternative hypothesis we chose was the coin is biased in favor of heads, and the test statistic we chose was the number of heads, $N_H$.
We could've also chosen one of the following options; each of them has the quality that large values point to one hypothesis, and small values point to the other:
- $\frac{N_H}{100}$, the proportion of heads.
- $N_H - 50$, the difference from the expected number of heads.
What if our alternative hypothesis was the coin is biased (either towards heads or tails)?
Absolute test statistics¶
For the alternative hypothesis "the coin is biased", one test statistic we could use is $|N_H - \frac{N}{2}|$, the absolute difference from the expected number of heads.
If this test statistic is large, it means that there were many more heads than expected, or many fewer heads than expected. If this test statistic is small, it means that the number of heads was close to expected.
For instance, suppose we flip 100 coins, and I tell you the absolute difference from the expected number of heads is 20.
Then, either we flipped 70 heads or 30 heads.
If our alternative hypothesis is that the coin was biased, then it doesn't matter in which direction it was biased, and this test statistic works.
But if our alternative hypothesis is that the coin was biased towards heads, then this is not helpful, because we can't tell whether there were 70 heads (evidence for the alternative) or 30 heads (not evidence for the alternative).
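As a sketch, here's what the two-sided test looks like with this absolute test statistic, reusing the simulated results from earlier and our observation of 59 heads:
abs_stats = np.abs(np.array(results) - 50)
# p-value for the "coin is biased (either way)" alternative.
(abs_stats >= abs(59 - 50)).mean()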
Important¶
We'd like to choose a test statistic such that large values of the test statistic correspond to one hypothesis, and small values correspond to the other.
In other words, we'll try to avoid "two-tailed tests". Rough rule of thumb:
If the alternative hypothesis is "A > B", then the test statistic should measure differences and should not contain an absolute value.
If the alternative hypothesis is "A and B are different", then the test statistic should measure distances and should contain an absolute value.
Example: Total variation distance¶
eth = pd.DataFrame(
    [['Asian', 0.15, 0.51],
     ['Black', 0.05, 0.02],
     ['Latino', 0.39, 0.16],
     ['White', 0.35, 0.2],
     ['Other', 0.06, 0.11]],
    columns=['Ethnicity', 'California', 'UCSD']
).set_index('Ethnicity')
eth
| Ethnicity | California | UCSD |
|---|---|---|
| Asian | 0.15 | 0.51 |
| Black | 0.05 | 0.02 |
| Latino | 0.39 | 0.16 |
| White | 0.35 | 0.20 |
| Other | 0.06 | 0.11 |
- We want to decide whether UCSD students were drawn at random from the state of California.
- The two categorical distributions above are clearly different. But how different are they?
Is the difference between the two distributions significant?¶
Let's establish our hypotheses.
- Null Hypothesis: UCSD students were selected at random from the population of California residents.
- Alternative Hypothesis: UCSD students were not selected at random from the population of California residents.
- Observation: Ethnic distribution of UCSD students.
- Test Statistic: We need a way of quantifying how different two categorical distributions are.
eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD', barmode='group')
Total variation distance¶
The total variation distance (TVD) is a test statistic that describes the distance between two categorical distributions.
If $A = [a_1, a_2, ..., a_k]$ and $B = [b_1, b_2, ..., b_k]$ are both categorical distributions, then the TVD between $A$ and $B$ is
$$\text{TVD}(A, B) = \frac{1}{2} \sum_{i = 1}^k |a_i - b_i|$$
def total_variation_distance(dist1, dist2):
    '''Given two categorical distributions,
    both sorted with same categories, calculates the TVD'''
    return np.sum(np.abs(dist1 - dist2)) / 2
Let's compute the TVD between UCSD's ethnic distribution and California's ethnic distribution.
observed_tvd = total_variation_distance(eth['UCSD'], eth['California'])
observed_tvd
0.41000000000000003
The issue is we don't know whether this is a large value or a small value – we don't know where it lies in the distribution of TVDs under the null.
The plan¶
To conduct our hypothesis test, we will:
- Repeatedly generate samples of size 30,000 (number of UCSD students) from the ethnic distribution of all of California.
- Each time, compute the TVD between the simulated distribution and California's distribution. This will generate an empirical distribution of TVDs, under the null.
- Finally, determine whether the observed TVD is consistent with the empirical distribution of TVDs.
Generating one random sample¶
Again, to sample from a categorical distribution, we use np.random.multinomial.
Important: We must sample from the "population" distribution here, which is the ethnic distribution of everyone in California.
# Number of students at UCSD in this example.
N_STUDENTS = 30_000
eth['California']
Ethnicity
Asian     0.15
Black     0.05
Latino    0.39
White     0.35
Other     0.06
Name: California, dtype: float64
np.random.multinomial(N_STUDENTS, eth['California'])
array([ 4478, 1538, 11699, 10543, 1742])
np.random.multinomial(N_STUDENTS, eth['California']) / N_STUDENTS
array([0.15, 0.05, 0.39, 0.35, 0.06])
Generating many random samples¶
We could write a for-loop to repeat the process on the previous slide many times (and you can in labs and projects). However, we now know about the size argument in np.random.multinomial, so let's use that here.
num_reps = 100_000
eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=num_reps) / N_STUDENTS
eth_draws
array([[0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       ...,
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06]])
eth_draws.shape
(100000, 5)
Notice that each row of eth_draws sums to 1, because each row is a simulated categorical distribution.
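We can double-check this with np.allclose (a quick sketch):
# Every row of eth_draws should sum to 1.
np.allclose(eth_draws.sum(axis=1), 1)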
Computing many TVDs, without a for-loop¶
One issue is that the total_variation_distance function we've defined won't work with eth_draws (unless we use a for-loop), so we'll have to rewrite the TVD computation using array operations.
# Broadcasting: subtract California's distribution from every row of eth_draws,
# then sum the absolute differences within each row.
tvds = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
tvds
array([0. , 0. , 0. , ..., 0.01, 0. , 0. ])
Just to make sure we did things correctly, we can compute the TVD between the first row of eth_draws and eth['California'] using our previous function.
# Note that this is the same as the first element in tvds.
total_variation_distance(eth_draws[0], eth['California'])
0.003033333333333346
Visualizing the empirical distribution of the test statistic¶
observed_tvd
0.41000000000000003
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=20, histnorm='probability',
title='Empirical Distribution of the TVD')
# fig.add_vline(x=observed_tvd, line_color='red')
fig
(np.array(tvds) >= observed_tvd).mean()
0.0
No, there's not a mistake in our code!
Conclusion¶
- The chance of seeing a TVD at least as large as the observed TVD, under the null, is essentially 0.
- This matches our intuition from the start – the two distributions looked very different to begin with. But now we're quite sure the difference can't be explained solely due to chance.
In our framework:¶
Summary of the method¶
To assess whether an "observed sample" was drawn randomly from a known categorical distribution:
- Use the TVD as the test statistic because it measures the distance between two categorical distributions.
- Sample at random from the population. Compute the TVD between each random sample and the known distribution to get an idea for what reasonable deviations from the eligible pool look like. Repeat this process many, many times.
- Compare:
- the empirical distribution of TVDs, with
- the observed TVD from the sample.
Aside¶
It was probably obvious that the difference is significant even before running a hypothesis test.
Why? There are 30,000 students. With a sample that large, such a difference in proportions is unlikely to be due to random chance alone; something more systematic must be at play.
But what if N_STUDENTS = 300, N_STUDENTS = 30, or N_STUDENTS = 3?
Discussion Question¶
At what value of N_STUDENTS would we fail to reject the null (at a 0.05 p-value cutoff)?
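One way to build intuition (a sketch, not the official answer): rerun the simulation for several hypothetical values of N_STUDENTS and watch how the p-value changes.
# Smaller samples produce noisier TVDs, so the observed TVD becomes less surprising.
for n in [3, 30, 300, 3_000, 30_000]:
    draws = np.random.multinomial(n, eth['California'], size=10_000) / n
    sim_tvds = np.sum(np.abs(draws - eth['California'].to_numpy()), axis=1) / 2
    print(n, (sim_tvds >= observed_tvd).mean())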
The hypothesis testing "recipe"¶
Faced with a question about the data raised by an observation...
- Carefully pose the question as a testable "yes or no" hypothesis.
- Decide on a test statistic that helps differentiate between instances that would affirm or reject the hypothesis.
- Create a probability model for the data generating process that reflects the baseline that you want to compare against.
- Simulate the data generating process using this probability model (the "null hypothesis").
- Assess if the observation is consistent with the simulations by computing a p-value.
Hypothesis testing vs. permutation testing¶
So far, we've been able to simulate draws from the null population directly:
But what if you have two samples, and no information about any population distribution? We want to ask: are these two samples different? Do they look like they were drawn from the same population?
That's where permutation testing comes in.
Example: Birth weight and smoking 🚬¶
*Note*: For familiarity, we'll start with an example from DSC 10. This means we'll move quickly!
Birth weight and smoking 🚬¶
Let's start by loading in the data.
baby = pd.read_csv('data/babyweights.csv')
baby
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
|---|---|---|---|---|---|---|
| 0 | 120 | 284 | 27 | 62 | 100 | False |
| 1 | 113 | 282 | 33 | 64 | 135 | False |
| 2 | 128 | 279 | 28 | 64 | 115 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 1171 | 130 | 291 | 30 | 65 | 150 | True |
| 1172 | 125 | 281 | 21 | 65 | 110 | False |
| 1173 | 117 | 297 | 38 | 65 | 129 | False |

1174 rows × 6 columns
We're only interested in the 'Birth Weight' and 'Maternal Smoker' columns.
baby = baby[['Maternal Smoker', 'Birth Weight']]
baby.head()
| | Maternal Smoker | Birth Weight |
|---|---|---|
| 0 | False | 120 |
| 1 | False | 113 |
| 2 | True | 128 |
| 3 | True | 108 |
| 4 | False | 136 |
Note that there are two samples:
- Birth weights of smokers' babies.
- Birth weights of non-smokers' babies.
Exploratory data analysis¶
How many babies are in each group? What is the average birth weight within each group?
baby.groupby('Maternal Smoker')['Birth Weight'].agg(['mean', 'count'])
| Maternal Smoker | mean | count |
|---|---|---|
| False | 123.09 | 715 |
| True | 113.82 | 459 |
Note that there are 16 ounces in 1 pound, so the above weights are roughly 7-8 pounds (e.g. $123.09 / 16 \approx 7.7$).
Visualizing birth weight distributions¶
Below, we draw the distributions of both sets of birth weights.
px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box',
title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
There appears to be a difference, but can it be attributed to random chance?
Hypothesis test setup¶
Null Hypothesis: In the population, birth weights of smokers' babies and non-smokers' babies have the same distribution, and the observed differences in our samples are due to random chance.
Alternative Hypothesis: In the population, smokers' babies have lower birth weights than non-smokers' babies, on average. The observed difference in our samples cannot be explained by random chance alone.
Issue: We don't know what the population distribution actually is – so how do we draw samples from it?
Null hypothesis: birth weights come from the same distribution¶
- Our null hypothesis states that "smoker" / "non-smoker" labels have no relationship to birth weight.
- In other words, the "smoker" / "non-smoker" labels may well have been assigned at random.
- (DGP stands for Data-Generating Process)
Alternative hypothesis: birth weights come from different distributions¶
- Our alternative hypothesis states that the birth weights of smokers' babies and non-smokers' babies come from different population distributions.
- That is, they come from different data generating processes.
- It also states that smokers' babies weigh significantly less.
Choosing a test statistic¶
We need a test statistic that can measure how different two numerical distributions are.
px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box',
title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
Easiest solution: Difference in group means.
Difference in group means¶
We'll choose our test statistic to be:
$$\text{mean weight of smokers' babies} - \text{mean weight of non-smokers' babies}$$
We could also compute the non-smokers' mean minus the smokers' mean.
group_means = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
group_means
Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64
At first, you may think to use loc with group_means to compute the difference in group means.
group_means.loc[True] - group_means.loc[False]
-9.266142572024918
However, you can also use the diff method.
pd.Series([1, 2, -3]).diff()
0    NaN
1    1.0
2   -5.0
dtype: float64
group_means.diff()
Maternal Smoker
False      NaN
True     -9.27
Name: Birth Weight, dtype: float64
group_means.diff().iloc[-1]
-9.266142572024918
# If we wanted to do (non-smokers' mean - smokers' mean).
# Think about why this is the case (hint: it has to do with how the resulting DataFrame after grouping is sorted).
group_means[::-1].diff().iloc[-1]
9.266142572024918
Hypothesis test setup¶
Null Hypothesis: In the population, birth weights of smokers' babies and non-smokers' babies have the same distribution, and the observed differences in our samples are due to random chance.
Alternative Hypothesis: In the population, smokers' babies have lower birth weights than non-smokers' babies, on average. The observed difference in our samples cannot be explained by random chance alone.
Test Statistic: Difference in group means.
- Issue: We don't know what the population distribution actually is – so how do we draw samples from it?
- This is different from the coin flipping and California ethnicity examples, because there the null hypotheses were well-defined probability models.
Implications of the null hypothesis¶
- Under the null hypothesis, both groups are sampled from the same distribution.
- If this is true, then the group label – 'Maternal Smoker' – has no effect on the birth weight.
- In our dataset, we saw one assignment of True or False to each baby.
baby.head()
| | Maternal Smoker | Birth Weight |
|---|---|---|
| 0 | False | 120 |
| 1 | False | 113 |
| 2 | True | 128 |
| 3 | True | 108 |
| 4 | False | 136 |
- Under the null hypothesis, we were just as likely to see any other assignment.
Permutation tests¶
In a permutation test, we generate new data by shuffling group labels.
- In our current example, this involves randomly assigning babies to True or False, while keeping the same number of Trues and Falses as we started with.
On each shuffle, we'll compute our test statistic (difference in group means).
If we shuffle many times and compute our test statistic each time, we will approximate the distribution of the test statistic.
We can then compare our observed statistic to this distribution, as in any other hypothesis test.
Shuffling¶
- Our goal, by shuffling, is to randomly assign values in the 'Maternal Smoker' column to values in the 'Birth Weight' column.
- We can do this by shuffling either column independently.
- Easiest solution: np.random.permutation.
- Could also use df.sample, but it's more complicated.
np.random.permutation(baby['Birth Weight'])
array([102, 133, 133, ..., 122, 96, 152])
with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))
with_shuffled.head()
| | Maternal Smoker | Birth Weight | Shuffled_Weights |
|---|---|---|---|
| 0 | False | 120 | 115 |
| 1 | False | 113 | 62 |
| 2 | True | 128 | 132 |
| 3 | True | 108 | 105 |
| 4 | False | 136 | 107 |
Now, we have a new sample of smokers' weights, and a new sample of non-smokers' weights!
Effectively, we took a random sample of 459 'Birth Weights' and assigned them to the smokers' group, and the remaining 715 to the non-smokers' group.
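Note that we could equally well have shuffled the 'Maternal Smoker' labels instead (a quick sketch; either choice gives a valid permutation test):
baby.assign(Shuffled_Smoker=np.random.permutation(baby['Maternal Smoker']))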
How close are the means of the shuffled groups?¶
One benefit of shuffling 'Birth Weight' (instead of 'Maternal Smoker') is that grouping by 'Maternal Smoker' allows us to see all of the following information with a single call to groupby.
group_means = with_shuffled.groupby('Maternal Smoker').mean()
group_means
| Maternal Smoker | Birth Weight | Shuffled_Weights |
|---|---|---|
| False | 123.09 | 119.10 |
| True | 113.82 | 120.02 |
Let's visualize both pairs of distributions – what do you notice?
for x in ['Birth Weight', 'Shuffled_Weights']:
    fig = px.histogram(
        with_shuffled, x=x, color='Maternal Smoker', histnorm='probability', marginal='box',
        title=f"Using the {x} column <br>(difference in means = {group_means[x].diff().iloc[-1]:.2f})",
        barmode='overlay', opacity=0.7)
    fig.update_layout(margin=dict(t=60))
    fig.show()
Simulating the empirical distribution of the test statistic¶
This was just one random shuffle.
The question we are trying to answer is, how likely is it that a random shuffle results in two samples where the smokers' mean is at least 9.26 ounces less than the non-smokers' mean?
To answer this question, we need the distribution of the test statistic. To generate that, we must shuffle many, many times. On each iteration, we must:
- Shuffle the weights and store them in a DataFrame.
- Compute the test statistic (difference in group means).
- Store the result.
n_repetitions = 500
differences = []
for _ in range(n_repetitions):
    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic.
    # Remember, False (0) comes before True (1),
    # so this computes True - False.
    group_means = (
        with_shuffled
        .groupby('Maternal Smoker')
        .mean()
        .loc[:, 'Shuffled_Weights']
    )
    difference = group_means.diff().iloc[-1]

    # Step 3: Store the result.
    differences.append(difference)
differences[:10]
[0.8896719837896256, 0.9755259990554066, -0.23000746530158267, -0.4732605085546169, 1.5299998476469057, -0.015372427137137379, -0.4517970047381823, 0.9969895028718554, 0.3530843883785053, -0.3874064932888359]
We already computed the observed statistic earlier, but we compute it again below to keep all of our calculations together.
observed_difference = baby.groupby('Maternal Smoker')['Birth Weight'].mean().diff().iloc[-1]
observed_difference
-9.266142572024918
Conclusion of the test¶
fig = px.histogram(
pd.DataFrame(differences), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Mean Differences <br> in Birth Weights (Smoker - Non-Smoker)')
fig.add_vline(x=observed_difference, line_color='red')
fig.update_layout(xaxis_range=[-10, 10], margin=dict(t=60))
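To quantify this, we can compute the p-value explicitly (since the alternative hypothesis says smokers' babies weigh less on average, we look at the left tail):
(np.array(differences) <= observed_difference).mean()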
Under the null hypothesis, we rarely see differences as large as 9.26 ounces.
Therefore, we reject the null hypothesis that the two groups come from the same distribution.
⚠️ Caution!¶
- We cannot conclude that smoking causes lower birth weight!
- This was an observational study; there may be confounding factors.
- Maybe smokers are more likely to drink caffeine, and caffeine causes lower birth weight.
Differences between categorical distributions¶
Example: Married vs. unmarried couples¶
- We will use data from a study conducted in 2010 by the National Center for Family and Marriage Research.
- The data consists of a national random sample of over 1,000 heterosexual couples who were either married or living together but unmarried.
- Each row corresponds to one person (not one couple).
couples = pd.read_csv('data/married_couples.csv')
couples.head()
| | hh_id | gender | mar_status | rel_rating | ... | education | hh_income | empl_status | hh_internet |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 1 | ... | 12 | 14 | 1 | 1 |
| 1 | 0 | 2 | 1 | 1 | ... | 9 | 14 | 1 | 1 |
| 2 | 1 | 1 | 1 | 1 | ... | 11 | 15 | 1 | 1 |
| 3 | 1 | 2 | 1 | 1 | ... | 9 | 15 | 1 | 1 |
| 4 | 2 | 1 | 1 | 1 | ... | 12 | 14 | 1 | 1 |

5 rows × 9 columns
# What does this expression compute?
couples['hh_id'].value_counts().value_counts()
2    1033
1       2
Name: hh_id, dtype: int64
We won't use all of the columns in the DataFrame.
couples = couples[['mar_status', 'empl_status', 'gender', 'age']]
couples.head()
| | mar_status | empl_status | gender | age |
|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 51 |
| 1 | 1 | 1 | 2 | 53 |
| 2 | 1 | 1 | 1 | 57 |
| 3 | 1 | 1 | 2 | 57 |
| 4 | 1 | 1 | 1 | 60 |
Cleaning the dataset¶
The numbers in the DataFrame correspond to the mappings below.
- 'mar_status': 1=married, 2=unmarried.
- 'empl_status': enumerated in the list below.
- 'gender': 1=male, 2=female.
- 'age': person's age in years.
couples.head()
| | mar_status | empl_status | gender | age |
|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 51 |
| 1 | 1 | 1 | 2 | 53 |
| 2 | 1 | 1 | 1 | 57 |
| 3 | 1 | 1 | 2 | 57 |
| 4 | 1 | 1 | 1 | 60 |
empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]
couples = couples.replace({
    'mar_status': {1: 'married', 2: 'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    'empl_status': {(k + 1): empl[k] for k in range(len(empl))}
})
couples.head()
| | mar_status | empl_status | gender | age |
|---|---|---|---|---|
| 0 | married | Working as paid employee | M | 51 |
| 1 | married | Working as paid employee | F | 53 |
| 2 | married | Working as paid employee | M | 57 |
| 3 | married | Working as paid employee | F | 57 |
| 4 | married | Working as paid employee | M | 60 |
Understanding the couples dataset¶
- Who is in our dataset? Mostly young people? Mostly married people? Mostly employed people?
- What is the distribution of values in each column?
# For categorical columns, this shows the 10 most common values and their frequencies.
# For numerical columns, this shows the result of calling the .describe() method.
for col in couples:
    if couples[col].dtype == 'object':
        empr = couples[col].value_counts(normalize=True).to_frame().iloc[:10]
    else:
        empr = couples[col].describe().to_frame()
    display(empr)
| | mar_status |
|---|---|
| married | 0.72 |
| unmarried | 0.28 |

| | empl_status |
|---|---|
| Working as paid employee | 0.61 |
| Not working - other | 0.10 |
| Working, self-employed | 0.10 |
| Not working - looking for work | 0.07 |
| Not working - disabled | 0.06 |
| Not working - retired | 0.05 |
| Not working - on a temporary layoff from a job | 0.02 |

| | gender |
|---|---|
| M | 0.5 |
| F | 0.5 |

| | age |
|---|---|
| count | 2068.00 |
| mean | 43.17 |
| std | 11.91 |
| ... | ... |
| 50% | 44.00 |
| 75% | 53.00 |
| max | 64.00 |

8 rows × 1 columns
Let's look at the distribution of age separately for married couples and unmarried couples.
px.histogram(couples, x='age', color='mar_status', histnorm='probability', marginal='box',
barmode='overlay', opacity=0.7)
How are these two distributions different? Why do you think there is a difference?
Understanding employment status in households¶
- Do married households more often have a stay-at-home spouse?
- Do households with unmarried couples more often have someone looking for work?
- How much does the employment status of the different households vary?
To answer these questions, let's compute the distribution of employment status conditional on household type (married vs. unmarried).
couples.sample(5).head()
| | mar_status | empl_status | gender | age |
|---|---|---|---|---|
| 1497 | unmarried | Working as paid employee | F | 30 |
| 1488 | married | Working as paid employee | M | 42 |
| 1080 | married | Working as paid employee | M | 61 |
| 467 | married | Working as paid employee | F | 54 |
| 1179 | married | Not working - other | F | 30 |
# Note that this is a shortcut to picking a column for values and using aggfunc='count'.
empl_cnts = couples.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
empl_cnts
| empl_status | married | unmarried |
|---|---|---|
| Not working - disabled | 72 | 45 |
| Not working - looking for work | 71 | 69 |
| Not working - on a temporary layoff from a job | 21 | 13 |
| Not working - other | 182 | 33 |
| Not working - retired | 94 | 11 |
| Working as paid employee | 906 | 347 |
| Working, self-employed | 138 | 66 |
Since there are different numbers of married and unmarried people in the dataset, we can't compare the counts above directly. We need to convert counts to proportions, separately for married and unmarried people.
empl_cnts.sum()
mar_status
married      1484
unmarried     584
dtype: int64
cond_distr = empl_cnts / empl_cnts.sum()
cond_distr
| empl_status | married | unmarried |
|---|---|---|
| Not working - disabled | 0.05 | 0.08 |
| Not working - looking for work | 0.05 | 0.12 |
| Not working - on a temporary layoff from a job | 0.01 | 0.02 |
| Not working - other | 0.12 | 0.06 |
| Not working - retired | 0.06 | 0.02 |
| Working as paid employee | 0.61 | 0.59 |
| Working, self-employed | 0.09 | 0.11 |
Both of the columns above sum to 1.
Differences in the distributions¶
Are the distributions of employment status for married people and for unmarried people who live with their partners different?
Is this difference just due to noise?
cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type', barmode='group')
Permutation test for household composition¶
Null Hypothesis: In the US, the distribution of employment status among those who are married is the same as among those who are unmarried and live with their partners. The difference between the two observed samples is due to chance.
Alternative Hypothesis: In the US, the distributions of employment status of the two groups are different.
Discussion Question¶
What is a good test statistic in this case?
*Hint:* What kind of distributions are we comparing?
Total variation distance¶
- Whenever we need to compare two categorical distributions, we use the TVD.
- Recall, the TVD is the sum of the absolute differences in proportions, divided by 2.
- In DSC 10, the only test statistic we ever used in permutation tests was the difference in group means/medians, but the TVD can be used in permutation tests as well.
cond_distr
| empl_status | married | unmarried |
|---|---|---|
| Not working - disabled | 0.05 | 0.08 |
| Not working - looking for work | 0.05 | 0.12 |
| Not working - on a temporary layoff from a job | 0.01 | 0.02 |
| Not working - other | 0.12 | 0.06 |
| Not working - retired | 0.06 | 0.02 |
| Working as paid employee | 0.61 | 0.59 |
| Working, self-employed | 0.09 | 0.11 |
Let's first compute the observed TVD:
(cond_distr['unmarried'] - cond_distr['married']).abs().sum() / 2
0.1269754089281099
Since we'll need to calculate the TVD repeatedly, let's define a function that computes it.
def tvd_of_groups(df, groups, cats):
    '''groups: the binary column (e.g. married vs. unmarried).
    cats: the categorical column (e.g. employment status).
    Note: assumes the values in the groups column are 'married' and 'unmarried'.
    '''
    cnts = df.pivot_table(index=cats, columns=groups, aggfunc='size')
    # Normalize each column.
    distr = cnts / cnts.sum()
    # Compute and return the TVD.
    return (distr['unmarried'] - distr['married']).abs().sum() / 2
# Same result as above.
observed_tvd = tvd_of_groups(couples, groups='mar_status', cats='empl_status')
observed_tvd
0.1269754089281099
Simulation¶
- Under the null hypothesis, marital status is not related to employment status.
- We can shuffle the marital status column and get an equally-likely dataset.
- On each shuffle, we will compute the TVD.
- Once we have many TVDs, we can ask, how often do we see a difference at least as large as our observed difference?
couples.head()
| | mar_status | empl_status | gender | age |
|---|---|---|---|---|
| 0 | married | Working as paid employee | M | 51 |
| 1 | married | Working as paid employee | F | 53 |
| 2 | married | Working as paid employee | M | 57 |
| 3 | married | Working as paid employee | F | 57 |
| 4 | married | Working as paid employee | M | 60 |
Here, we'll shuffle marital statuses, though remember, we could shuffle employment statuses too.
couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))
| | mar_status | empl_status | gender | age | shuffled_mar |
|---|---|---|---|---|---|
| 0 | married | Working as paid employee | M | 51 | married |
| 1 | married | Working as paid employee | F | 53 | unmarried |
| 2 | married | Working as paid employee | M | 57 | unmarried |
| ... | ... | ... | ... | ... | ... |
| 2065 | unmarried | Working as paid employee | F | 53 | unmarried |
| 2066 | unmarried | Working as paid employee | M | 44 | unmarried |
| 2067 | unmarried | Working as paid employee | F | 42 | married |

2068 rows × 5 columns
Let's do this repeatedly.
N = 1000
tvds = []
for _ in range(N):
    # Shuffle marital statuses.
    with_shuffled = couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))

    # Compute and store the TVD.
    tvd = tvd_of_groups(with_shuffled, groups='shuffled_mar', cats='empl_status')
    tvds.append(tvd)
Notice that by defining a function that computes our test statistic, our simulation code is much cleaner.
Conclusion of the test¶
fig = px.histogram(tvds, x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the TVD')
fig.update_layout(xaxis_range=[0, 0.2])
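As in our earlier examples, we can compute the p-value explicitly (a quick check of what the histogram shows):
(np.array(tvds) >= observed_tvd).mean()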
We reject the null hypothesis that married/unmarried households have similar employment makeups.
We can't say anything about why the employment makeups are different, though!
In our Framework¶
Discussion Question¶
In the definition of the TVD, we divide the sum of the absolute differences in proportions between the two distributions by 2.
def tvd(a, b):
    return np.sum(np.abs(a - b)) / 2
Question: If we divided by 200 instead of 2, would we still reject the null hypothesis?
Student-Submitted Questions¶
- Come up with a question or two that could be answered with a hypothesis test.
- E.g. Is it faster to take the stairs or elevator up to the third floor of HDSI?
- I will explain the data we would gather and the hypothesis test we would run!
- Will go until no more questions or we run out of time.
- https://wall.sli.do/event/g2dESFa2co9kwmUwqfuCNL?section=e82e38eb-254e-4eb9-9de3-341347c52119