from dsc80_utils import *

names = np.load(Path('data') / 'names.npy', allow_pickle=True)

# By default, the sampling is done WITH replacement.
np.random.choice(names, 10)

array(['Lin', 'Yeogyeong', 'Subika', 'Jesse', 'Aakash', 'Chenlong',
       'David', 'Ethan', 'Seanna', 'Ailinna'], dtype=object)

# To sample WITHOUT replacement, set replace=False.
# This is known as "simple random sampling."
np.random.choice(names, 10, replace=False)

array(['Kening', 'Ethan', 'Diya', 'Seanna', 'Colin', 'Zening',
       'Stephanie', 'Jiaye', 'Dylan', 'Jessica'], dtype=object)

# Samples WITHOUT replacement by default (the opposite of np.random.choice).
pd.DataFrame(names, columns=['name']).sample(10)

# Draws 100 elements from a population in which 50% are group 0 and 50% are group 1.
# This sampling is done WITH replacement.
# In other words, each sampled element has a 50% chance of being group 0 and a 50% chance of being group 1.
np.random.multinomial(100, [0.5, 0.5])

array([50, 50])

eth = pd.DataFrame(
    [['Asian', 0.15, 0.51],
     ['Black', 0.05, 0.02],
     ['Latino', 0.39, 0.16],
     ['White', 0.35, 0.2],
     ['Other', 0.06, 0.11]],
    columns=['Ethnicity', 'California', 'UCSD']
).set_index('Ethnicity')

eth

eth.plot(kind='barh', title='Ethnic Distribution of California and UCSD', barmode='group')

def tvd(dist1, dist2):
    return np.abs(dist1 - dist2).sum() / 2

# The diff method finds the differences of consecutive elements in a Series.
pd.Series([4, 5, -2]).diff()

0    NaN
1    1.0
2   -7.0
dtype: float64

observed_tvd = eth.diff(axis=1).abs().sum().iloc[1] / 2
observed_tvd

0.41000000000000003

# Number of students at UCSD in this example.
N_STUDENTS = 30_000

eth['California']

Ethnicity
Asian     0.15
Black     0.05
Latino    0.39
White     0.35
Other     0.06
Name: California, dtype: float64

np.random.multinomial(N_STUDENTS, eth['California'])

array([ 4446,  1517, 11713, 10613,  1711])

np.random.multinomial(N_STUDENTS, eth['California']) / N_STUDENTS

array([0.15, 0.05, 0.39, 0.35, 0.06])

eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=100_000) / N_STUDENTS
eth_draws

array([[0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       ...,
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06],
       [0.15, 0.05, 0.39, 0.35, 0.06]])

eth_draws.shape

(100000, 5)

# The values here appear rounded.
tvds = np.abs(eth_draws - eth['California'].to_numpy()).sum(axis=1) / 2
tvds

array([0., 0., 0., ..., 0., 0., 0.])

observed_tvd

0.41000000000000003

fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=20, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig

(np.array(tvds) >= observed_tvd).mean()

0.0

eth

With 2 students, the p-value is 0.72779.
With 4 students, the p-value is 0.3063.
With 8 students, the p-value is 0.08288.
With 16 students, the p-value is 0.00365.
With 32 students, the p-value is 2e-05.
With 64 students, the p-value is 0.0.
With 128 students, the p-value is 0.0.
With 256 students, the p-value is 0.0.

Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64

-9.266142572024918

array([144, 113, 115, ..., 109,  94, 132])

[0.8538994774288824,
 0.8538994774288824,
 -0.2872434754787747,
 0.6857686975334047,
 -0.7415543062601841,
 0.7430047077105826,
 0.07048158812864358,
 0.8646312293371068,
 -0.8166765696177407,
 0.4782881606410996]

-9.266142572024918

2    1033
1       2
Name: hh_id, dtype: int64

mar_status
married      1484
unmarried     584
dtype: int64

0.1269754089281099

0.1269754089281099

eth

def ethnicity_test(N_STUDENTS):
    eth_draws = np.random.multinomial(N_STUDENTS, eth['California'], size=100_000) / N_STUDENTS
    tvds = np.sum(np.abs(eth_draws - eth['California'].to_numpy()), axis=1) / 2
    return (np.array(tvds) >= observed_tvd).mean()

for i in range(1, 9):
    N_STUDENTS = 2 ** i
    print(f'With {N_STUDENTS} students, the p-value is {ethnicity_test(N_STUDENTS)}.')

With 2 students, the p-value is 0.72779.
With 4 students, the p-value is 0.3063.
With 8 students, the p-value is 0.08288.
With 16 students, the p-value is 0.00365.
With 32 students, the p-value is 2e-05.
With 64 students, the p-value is 0.0.
With 128 students, the p-value is 0.0.
With 256 students, the p-value is 0.0.

baby = pd.read_csv(Path('data') / 'babyweights.csv')
baby

baby = baby[['Maternal Smoker', 'Birth Weight']]
baby.head()

baby.groupby('Maternal Smoker')['Birth Weight'].agg(['mean', 'count'])

fig = px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
                   title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
fig

fig = px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
                   title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)
fig

group_means = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
group_means

Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64

group_means.loc[True] - group_means.loc[False]

-9.266142572024918

baby.head()

np.random.permutation(baby['Birth Weight'])

array([144, 113, 115, ..., 109,  94, 132])

with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))
with_shuffled.head()

group_means = with_shuffled.groupby('Maternal Smoker').mean()
group_means

for x in ['Birth Weight', 'Shuffled_Weights']:
    diff = group_means.loc[True, x] - group_means.loc[False, x]
    fig = px.histogram(
        with_shuffled, x=x, color='Maternal Smoker', histnorm='probability', marginal='box', 
        title=f"Using the {x} column <br>(difference in means = {diff:.2f})",
        barmode='overlay', opacity=0.7)
    fig.update_layout(margin=dict(t=60))
    fig.show()

n_repetitions = 500

differences = []
for _ in range(n_repetitions):
    
    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic.
    # Remember, False (0) comes before True (1),
    # so this computes True - False.
    group_means = (
        with_shuffled
        .groupby('Maternal Smoker')
        .mean()
        .loc[:, 'Shuffled_Weights']
    )
    difference = group_means.loc[True] - group_means.loc[False]
    
    # Step 4: Store the result
    differences.append(difference)
    
differences[:10]

[0.8538994774288824,
 0.8538994774288824,
 -0.2872434754787747,
 0.6857686975334047,
 -0.7415543062601841,
 0.7430047077105826,
 0.07048158812864358,
 0.8646312293371068,
 -0.8166765696177407,
 0.4782881606410996]

mean_weights = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
observed_difference = mean_weights[True] - mean_weights[False]
observed_difference

-9.266142572024918

fig = px.histogram(
    pd.DataFrame(differences), x=0, nbins=50, histnorm='probability', 
    title='Empirical Distribution of the Mean Differences <br> in Birth Weights (Smoker - Non-Smoker)')
fig.add_vline(x=observed_difference, line_color='red')
fig.update_layout(xaxis_range=[-10, 10], margin=dict(t=60))

couples = pd.read_csv(Path('data') / 'married_couples.csv')
couples.head()

# What does this expression compute?
couples['hh_id'].value_counts().value_counts()

2    1033
1       2
Name: hh_id, dtype: int64

couples = couples[['mar_status', 'empl_status', 'gender', 'age']]
couples.head()

couples.head()

empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]

couples = couples.replace({
    'mar_status': {1: 'married', 2: 'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    'empl_status': {(k + 1): empl[k] for k in range(len(empl))}
})

couples.head()

# For categorical columns, this shows the 10 most common values and their frequencies.
# For numerical columns, this shows the result of calling the .describe() method.
for col in couples:
    if couples[col].dtype == 'object':
        empr = couples[col].value_counts(normalize=True).to_frame().iloc[:10]
    else:
        empr = couples[col].describe().to_frame()
    display(empr)

px.histogram(couples, x='age', color='mar_status', histnorm='probability', marginal='box',
             barmode='overlay', opacity=0.7)

couples.sample(5).head()

# Note that this is a shortcut to picking a column for values and using aggfunc='count'.
empl_cnts = couples.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
empl_cnts

empl_cnts.sum()

mar_status
married      1484
unmarried     584
dtype: int64

cond_distr = empl_cnts / empl_cnts.sum()
cond_distr

cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type', barmode='group')

cond_distr

(cond_distr['unmarried'] - cond_distr['married']).abs().sum() / 2

0.1269754089281099

def tvd_of_groups(df, groups, cats):
    '''groups: the binary column (e.g. married vs. unmarried).
       cats: the categorical column (e.g. employment status).
    '''
    cnts = df.pivot_table(index=cats, columns=groups, aggfunc='size')
    # Normalize each column.
    distr = cnts / cnts.sum()
    # Compute and return the TVD.
    return (distr['unmarried'] - distr['married']).abs().sum() / 2

# Same result as above.
observed_tvd = tvd_of_groups(couples, groups='mar_status', cats='empl_status')
observed_tvd

0.1269754089281099

couples.head()

couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))

N = 1000
tvds = []

for _ in range(N):
    # Shuffle marital statuses.
    with_shuffled = couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))
    
    # Compute and store the TVD.
    tvd = tvd_of_groups(with_shuffled, groups='shuffled_mar', cats='empl_status')
    tvds.append(tvd)

fig = px.histogram(tvds, x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.update_layout(xaxis_range=[0, 0.2])

	age
count	2068.00
mean	43.17
std	11.91
...	...
50%	44.00
75%	53.00
max	64.00

	California	UCSD
Ethnicity
Asian	0.15	0.51
Black	0.05	0.02
Latino	0.39	0.16
White	0.35	0.20
Other	0.06	0.11

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
...	...	...	...	...	...	...
1171	130	291	30	65	150	True
1172	125	281	21	65	110	False
1173	117	297	38	65	129	False

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1

	mar_status	empl_status	gender	age
0	married	Working as paid employee	M	51
1	married	Working as paid employee	F	53
2	married	Working as paid employee	M	57
3	married	Working as paid employee	F	57
4	married	Working as paid employee	M	60

	empl_status
Working as paid employee	0.61
Not working - other	0.10
Working, self-employed	0.10
Not working - looking for work	0.07
Not working - disabled	0.06
Not working - retired	0.05
Not working - on a temporary layoff from a job	0.02

	mar_status	empl_status	gender	age
181	married	Working as paid employee	F	60
1887	unmarried	Working as paid employee	F	27
745	married	Working as paid employee	F	39
1916	unmarried	Working, self-employed	M	36
1241	married	Working as paid employee	F	44

mar_status	married	unmarried
empl_status
Not working - disabled	72	45
Not working - looking for work	71	69
Not working - on a temporary layoff from a job	21	13
Not working - other	182	33
Not working - retired	94	11
Working as paid employee	906	347
Working, self-employed	138	66

	name
105	Yiran
48	Kailey
76	Niha
...	...
21	David
24	Diego
26	Dylan

	gender
M	0.5
F	0.5

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1

Lecture 6 – Hypothesis Testing¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda 📆¶

Why are we learning hypothesis testing again?¶

Data scope¶

Where are we in the data science lifecycle?¶

Data scope¶

Example: Wikipedia awards¶

Example: Who will win the election?¶

🔑 Key Idea: Random samples look like the access frame they were sampled from!¶

Sampling in practice¶

Overview of hypothesis testing¶

What problem does hypothesis testing solve?¶

Why hypothesis testing is difficult to learn¶

The hypothesis testing "recipe"¶

Question 🤔 (Answer at q.dsc80.com)

Example: Total variation distance¶

Ethnic distribution of California vs. UCSD¶

Is the difference between the two distributions significant?¶

Total variation distance¶

The plan¶

Generating one random sample¶

Generating many random samples and computing TVDs, without a for-loop¶

Visualizing the empirical distribution of the test statistic¶

Conclusion¶

Summary of the method¶

Aside¶

Question 🤔 (Answer at q.dsc80.com)

Permutation testing¶

Hypothesis testing vs. permutation testing¶

Hypothesis testing vs. permutation testing¶

Example: Birth weight and smoking 🚬¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

Null hypothesis: birth weights come from the same distribution¶

Alternative hypothesis: birth weights come from different distributions¶

Choosing a test statistic¶

Difference in group means¶

Hypothesis test setup¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

How close are the means of the shuffled groups?¶

Simulating the empirical distribution of the test statistic¶

Conclusion of the test¶

⚠️ Caution!¶

Hypothesis testing vs. permutation testing¶

Question 🤔 (Answer at q.dsc80.com)

Question 🤔 (Answer at q.dsc80.com)

Permutation testing meets TVD¶

Note: This section has another hypothesis testing example. We might not have time to cover the example in lecture, but you should understand it. You can also watch this podcast, starting from 4:43 for a walkthrough.

Example: Married vs. unmarried couples¶

Cleaning the dataset¶

Understanding the couples dataset¶

Understanding employment status in households¶

Differences in the distributions¶

Permutation test for household composition¶

Total variation distance¶

Simulation¶

Conclusion of the test¶

Summary, next time¶

Summary¶

Next time¶

Generating many random samples and computing TVDs, without a `for`-loop¶

Note: This section has another hypothesis testing example. We might not have time to cover the example in lecture, but you should understand it.

You can also watch this podcast, starting from 4:43 for a walkthrough.

Understanding the `couples` dataset¶

	hh_id	gender	mar_status	rel_rating	...	education	hh_income	empl_status	hh_internet
0	0	1	1	1	...	12	14	1	1
1	0	2	1	1	...	9	14	1	1
2	1	1	1	1	...	11	15	1	1
3	1	2	1	1	...	9	15	1	1
4	2	1	1	1	...	12	14	1	1