import numpy as np
import pandas as pd
import pathlib
import os

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(10, 5))
plt.rc('font', size=12)


# Kaiser dataset, 70s 
baby_fp = os.path.join('data', 'baby.csv')
baby = pd.read_csv(baby_fp)
baby.head()


smoking_and_birthweight = baby[['Maternal Smoker', 'Birth Weight']]
smoking_and_birthweight.head()


smoking_and_birthweight.groupby('Maternal Smoker').count()


smoking_and_birthweight.groupby('Maternal Smoker').mean()


title = "Birth Weight by Mother's Smoking Status"

(
    smoking_and_birthweight
    .groupby('Maternal Smoker')['Birth Weight']
    .plot(kind='hist', density=True, legend=True,
          ec='w', bins=np.arange(50, 200, 5), alpha=0.75,
          title=title)
);


(
    smoking_and_birthweight
    .groupby('Maternal Smoker')['Birth Weight']
    .plot(kind='kde', legend=True,
          title=title)
);


(
    smoking_and_birthweight
    .groupby('Maternal Smoker')['Birth Weight']
    .plot(kind='kde', legend=True,
          title=title,
          figsize=(4, 3))
);


smoking_and_birthweight.head()


means_table = smoking_and_birthweight.groupby('Maternal Smoker').mean()
means_table


means_table.loc[True, 'Birth Weight'] - means_table.loc[False, 'Birth Weight']

-9.266142572024918


s = pd.Series([1, 2, 5, 9, 15])
s

0     1
1     2
2     5
3     9
4    15
dtype: int64


s.diff()

0    NaN
1    1.0
2    3.0
3    4.0
4    6.0
dtype: float64


means_table


means_table.diff()


observed_difference = means_table.diff().iloc[-1, 0]
observed_difference

-9.266142572024918


smoking_and_birthweight.head()


smoking_and_birthweight.sample(frac=1)


smoking_and_birthweight.head()


shuffled_weights = (
    smoking_and_birthweight['Birth Weight']
    .sample(frac=1)
    .reset_index(drop=True) # Question: What will happen if we do not reset the index?
)

shuffled_weights.head()

0     86
1    130
2    123
3    115
4    119
Name: Birth Weight, dtype: int64


original_and_shuffled = (
    smoking_and_birthweight
    .assign(**{'Shuffled Birth Weight': shuffled_weights})
)

original_and_shuffled.head(10)


original_and_shuffled.groupby('Maternal Smoker').mean()


fig, axes = plt.subplots(1, 2, figsize=(15, 5))

title = 'Birth Weights by Maternal Smoker, SHUFFLED'
original_and_shuffled.groupby('Maternal Smoker')['Shuffled Birth Weight'].plot(kind='kde', title=title, ax=axes[0])

title = 'Birth Weights by Maternal Smoker'
original_and_shuffled.groupby('Maternal Smoker')['Birth Weight'].plot(kind='kde', title=title, ax=axes[1]);


n_repetitions = 500

differences = []
for _ in range(n_repetitions):
    
    # Step 1: Shuffle the weights
    shuffled_weights = (
        smoking_and_birthweight['Birth Weight']
        .sample(frac=1)
        .reset_index(drop=True) # Be sure to reset the index! (Why?)
    )
    
    # Step 2: Put them in a DataFrame
    shuffled = (
        smoking_and_birthweight
        .assign(**{'Shuffled Birth Weight': shuffled_weights})
    )
    
    # Step 3: Compute the test statistic
    group_means = (
        shuffled
        .groupby('Maternal Smoker')
        .mean()
        .loc[:, 'Shuffled Birth Weight']
    )
    difference = group_means.diff().iloc[-1]
    
    # Step 4: Store the result
    differences.append(difference)
    
differences[:10]

[-0.03683593095357196,
 0.6106464341758482,
 -0.5090330149153601,
 -2.6160336395630566,
 -1.90058351234822,
 0.7644682115270456,
 -0.2478937184819614,
 -0.5483827719121876,
 -0.8667580785227926,
 0.5069061657297027]


observed_difference = (
    smoking_and_birthweight
    .groupby('Maternal Smoker')['Birth Weight']
    .mean()
    .diff()
    .iloc[-1]
)

observed_difference

-9.266142572024918


title = 'Mean Differences in Birth Weights (Smoker - Non-Smoker)'
pd.Series(differences).plot(kind='hist', density=True, ec='w', bins=10, title=title)
plt.axvline(x=observed_difference, color='red', linewidth=3);


couples_fp = os.path.join('data', 'married_couples.csv')
couples = pd.read_csv(couples_fp)


couples.head()


couples = couples[['mar_status', 'empl_status', 'gender', 'age']]
couples.head()


couples.head()


empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]


couples = couples.replace({
    'mar_status': {1:'married', 2:'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    'empl_status': {(k + 1): empl[k] for k in range(len(empl))}
})


couples.head()


# This cell shows the top 10 most common values in each column, along with their frequencies.
for col in couples:
    print(col)
    empr = couples[col].value_counts(normalize=True).to_frame().iloc[:10]
    display(empr)

mar_status

empl_status

gender

age


couples['age'].plot(kind='hist', density=True, ec='w', bins=np.arange(18, 66));


G = couples.groupby('mar_status')
ax = G.get_group('married')['age'].rename('Married').plot(kind='hist', density=True, alpha=0.75, 
                                                          ec='w', bins=np.arange(18, 66, 2),
                                                          legend=True, title='Distribution of Ages (Married vs. Unmarried)')
G.get_group('unmarried')['age'].rename('Unmarried').plot(kind='hist', density=True, alpha=0.75, 
                                                       ec='w', bins=np.arange(18, 66, 2),
                                                       ax=ax, legend=True);


couples.head()


# Note that this is a shortcut to picking a column for values and using aggfunc='count'
empl_cnts = couples.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
empl_cnts


empl_cnts.sum()

mar_status
married      1484
unmarried     584
dtype: int64


cond_distr = empl_cnts / empl_cnts.sum()
cond_distr


cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type');


cond_distr


cond_distr.diff(axis=1).iloc[:, -1].abs().sum() / 2

0.1269754089281099


couples.head()


def tvd_of_groups(df):
    cnts = df.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
    distr = cnts / cnts.sum()   # Normalized
    return distr.diff(axis=1).iloc[:, -1].abs().sum() / 2  # TVD


# Same result as above
observed_tvd = tvd_of_groups(couples)
observed_tvd

0.1269754089281099


couples.head()


s = couples['mar_status'].sample(frac=1).reset_index(drop=True)
s

0       unmarried
1         married
2         married
3         married
4         married
          ...    
2063      married
2064      married
2065      married
2066      married
2067    unmarried
Name: mar_status, Length: 2068, dtype: object


shuffled = couples.loc[:, ['empl_status']].assign(mar_status=s)
shuffled


tvd_of_groups(shuffled)

0.06320385481667465


N = 1000
tvds = []

for _ in range(N):
    
    s = couples['mar_status'].sample(frac=1).reset_index(drop=True)
    shuffled = couples.loc[:, ['empl_status']].assign(mar_status=s)
    
    tvds.append(tvd_of_groups(shuffled))

tvds = pd.Series(tvds)


pval = (tvds >= observed_tvd).sum() / N
tvds.plot(kind='hist', density=True, ec='w', bins=20, title=f'P-value: {pval}', label='Simulated TVDs')
plt.axvline(x=observed_tvd, color='red', linewidth=3, label='P-value')

perc = np.percentile(tvds, 95) # 5% significance level
plt.axvline(x=perc, color='y', linewidth=3, label='P-value cutoff')

plt.legend();


couples.head()


couples['empl_status'].value_counts()

Working as paid employee                          1253
Not working - other                                215
Working, self-employed                             204
Not working - looking for work                     140
Not working - disabled                             117
Not working - retired                              105
Not working - on a temporary layoff from a job      34
Name: empl_status, dtype: int64


not_work_no_choice = couples['empl_status'].isin(['Not working - looking for work', 'Not working - disabled'])
not_work_no_choice

0       False
1       False
2       False
3       False
4       False
        ...  
2063    False
2064    False
2065    False
2066    False
2067    False
Name: empl_status, Length: 2068, dtype: bool


couples['not_work_no_choice'] = not_work_no_choice.replace({True: 1, False: 0})
couples.head(12)


couples.groupby('mar_status')['not_work_no_choice'].mean()

mar_status
married      0.096361
unmarried    0.195205
Name: not_work_no_choice, dtype: float64


obs_mean = couples.groupby('mar_status')['not_work_no_choice'].mean().diff().iloc[-1]
obs_mean

0.0988442934682273


N = 1000
means = []

for _ in range(N):
    
    s = couples['mar_status'].sample(frac=1).reset_index(drop=True)
    shuffled = couples.loc[:, ['not_work_no_choice']].assign(mar_status=s)

    m = shuffled.groupby('mar_status')['not_work_no_choice'].mean().diff().iloc[-1]
    
    means.append(m)

means = pd.Series(means)


pval = (means >= obs_mean).sum() / N
means.plot(kind='hist', density=True, ec='w', bins=20, title=f'P-value: {pval}', label='Simulated Differences in Means')
plt.axvline(x=obs_mean, color='red', linewidth=3, label='P-value')

perc = np.percentile(means, 95) # 5% significance level
plt.axvline(x=perc, color='y', linewidth=3, label='P-value cutoff')

plt.legend();

	age
53	0.037234
55	0.036750
54	0.031431
40	0.030464
44	0.029981
30	0.028046
48	0.027563
49	0.027079
52	0.027079
43	0.026596

mar_status	married	unmarried
empl_status
Not working - disabled	0.048518	0.077055
Not working - looking for work	0.047844	0.118151
Not working - on a temporary layoff from a job	0.014151	0.022260
Not working - other	0.122642	0.056507
Not working - retired	0.063342	0.018836
Working as paid employee	0.610512	0.594178
Working, self-employed	0.092992	0.113014

mar_status	married	unmarried
empl_status
Not working - disabled	0.048518	0.077055
Not working - looking for work	0.047844	0.118151
Not working - on a temporary layoff from a job	0.014151	0.022260
Not working - other	0.122642	0.056507
Not working - retired	0.063342	0.018836
Working as paid employee	0.610512	0.594178
Working, self-employed	0.092992	0.113014

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
3	108	282	23	67	125	True
4	136	286	25	62	93	False

	Birth Weight	Shuffled Birth Weight
Maternal Smoker
False	123.085315	119.995804
True	113.819172	118.631808

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1

	empl_status
Working as paid employee	0.605899
Not working - other	0.103965
Working, self-employed	0.098646
Not working - looking for work	0.067698
Not working - disabled	0.056576
Not working - retired	0.050774
Not working - on a temporary layoff from a job	0.016441

	gender
M	0.5
F	0.5

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1

Lecture 10 – Permutation Testing¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Overview¶

Review: Hypothesis testing¶

Examples so far¶

Today's lecture¶

Permutation testing¶

Example: Birth weight and smoking 🚬¶

Birth weight and smoking¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

The setup¶

Alternative hypothesis: birth weights come from different distributions...¶

Null hypothesis: birth weights come from the same distribution¶

Choosing a test statistic¶

Difference in group means¶

Another approach¶

Testing through simulation¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

Shuffling just one column¶

A single shuffle¶

How close are the means of the shuffled groups?¶

Simulation¶

Conclusion of the test¶

⚠️ Caution!¶

Differences between categorical distributions¶

Example: Married vs. unmarried couples¶

Cleaning the dataset¶

Understanding the couples dataset¶

Understanding employment status in households¶

Differences in the distributions¶

Permutation test for household composition¶

Discussion Question¶

Total variation distance¶

Simulation¶

Results¶

Conclusion: household composition¶

Discussion Question¶

An alternative investigation¶

Simulation¶

Results¶

Conclusion: Household composition; not working, not by choice¶

Summary, next time¶

Summary¶

Understanding the `couples` dataset¶

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1