import pandas as pd
import numpy as np
import os

import seaborn as sns
import plotly.express as px
pd.options.plotting.backend = 'plotly'


# Load the baby dataset from data/baby.csv.
baby = pd.read_csv(os.path.join('data', 'baby.csv'))
baby.head()


# Keep only the two columns this analysis uses.
baby = baby.loc[:, ['Maternal Smoker', 'Birth Weight']]
baby.head()


# For each smoking status: mean birth weight and count of non-missing values.
(
    baby
    .groupby('Maternal Smoker')['Birth Weight']
    .agg(['mean', 'count'])
)


# Overlaid, normalized histograms (with marginal box plots), colored by 'Maternal Smoker'.
# NOTE(review): no x= is passed, so plotly express chooses the plotted column itself --
# presumably 'Birth Weight', the only other column left in `baby`; confirm.
px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
             title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)


# Same figure as the cell directly above, repeated verbatim.
px.histogram(baby, color='Maternal Smoker', histnorm='probability', marginal='box', 
             title="Birth Weight by Mother's Smoking Status", barmode='overlay', opacity=0.7)


# Mean 'Birth Weight' for each value of 'Maternal Smoker' (a Series indexed by False/True).
group_means = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
group_means

Maternal Smoker
False    123.085315
True     113.819172
Name: Birth Weight, dtype: float64


# Difference in group means, by label: (mean where 'Maternal Smoker' is True) - (mean where it is False).
group_means.loc[True] - group_means.loc[False]

-9.266142572024918


# diff() subtracts the previous element from each element; the first entry has no predecessor, so it is NaN.
pd.Series([1, 2, -3]).diff()

0    NaN
1    1.0
2   -5.0
dtype: float64


# With two groups, the last entry of diff() is (True mean - False mean).
group_means.diff()

Maternal Smoker
False         NaN
True    -9.266143
Name: Birth Weight, dtype: float64


# Use .iloc for positional access: the index holds the labels False/True,
# so a plain [-1] is looked up as a label and raises KeyError: -1.
group_means.diff().iloc[-1]

-9.266142572024918


# To compute (non-smokers' mean - smokers' mean) instead, reverse the Series before calling diff.
# Think about why this works (hint: it has to do with how the result of grouping is sorted).
# As above, .iloc[-1] is needed because [-1] would be treated as a label, not a position.
group_means[::-1].diff().iloc[-1]

9.266142572024918


baby.head()


# A randomly reordered copy of the 'Birth Weight' values, returned as an array; `baby` is not modified.
np.random.permutation(baby['Birth Weight'])

array([102, 138, 117, ..., 141, 129, 103])


# Attach one random shuffle of the birth weights as a new column.
with_shuffled = baby.assign(
    Shuffled_Weights=np.random.permutation(baby['Birth Weight'])
)
with_shuffled.head()


# Per-group means of both the original and the shuffled weights.
group_means = with_shuffled.groupby('Maternal Smoker').mean()
group_means


# One figure per weight column, with that column's difference in group means in the title.
for column in ('Birth Weight', 'Shuffled_Weights'):
    mean_gap = group_means[column].diff().iloc[-1]
    fig = px.histogram(
        with_shuffled,
        x=column,
        color='Maternal Smoker',
        histnorm='probability',
        marginal='box',
        title=f"Using the {column} column (difference in means = {mean_gap})",
        barmode='overlay',
        opacity=0.7,
    )
    fig.show()


n_repetitions = 500

# Simulated values of the test statistic, one per shuffle.
differences = []
for _ in range(n_repetitions):

    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic.
    # groupby sorts its keys and False sorts before True,
    # so diff() computes True - False.
    # Select the shuffled column before aggregating so that only the
    # column we need is averaged on each iteration.
    group_means = with_shuffled.groupby('Maternal Smoker')['Shuffled_Weights'].mean()
    difference = group_means.diff().iloc[-1]

    # Step 3: Store the result.
    differences.append(difference)

differences[:10]

[1.072111766229412,
 -1.3532641650288753,
 -0.1763487057604749,
 0.07405883876471364,
 1.043493761140823,
 0.5140606670018428,
 -0.8560263266145682,
 -1.6251352133704984,
 -1.6966802260919849,
 0.5677194265429648]


# Observed test statistic on the unshuffled data: (True mean - False mean) of 'Birth Weight'.
observed_difference = baby.groupby('Maternal Smoker')['Birth Weight'].mean().diff().iloc[-1]
observed_difference

-9.266142572024918


# Normalized histogram of the simulated differences.
fig = px.histogram(
    pd.DataFrame(differences),
    x=0,
    nbins=50,
    histnorm='probability',
    title='Empirical Distribution of the Mean Differences in Birth Weights (Smoker - Non-Smoker)',
)
# Mark the observed difference with a red vertical line.
fig.add_vline(x=observed_difference, line_color='red')
# Fix the x-axis range to [-15, 15].
fig.update_layout(xaxis_range=[-15, 15])


# Load the married-couples dataset from data/married_couples.csv.
couples_fp = os.path.join('data', 'married_couples.csv')
couples = pd.read_csv(couples_fp)
couples.head()


# What does this expression compute?
couples['hh_id'].value_counts().value_counts()

2    1033
1       2
Name: hh_id, dtype: int64


# Keep only the four columns used below.
couples = couples[['mar_status', 'empl_status', 'gender', 'age']]
couples.head()


couples.head()


# Employment status labels in code order: numeric code k (starting at 1) maps to empl[k - 1].
empl = [
    'Working as paid employee',
    'Working, self-employed',
    'Not working - on a temporary layoff from a job',
    'Not working - looking for work',
    'Not working - retired',
    'Not working - disabled',
    'Not working - other'
]


# Replace the numeric codes in each listed column with readable labels.
couples = couples.replace({
    'mar_status': {1: 'married', 2: 'unmarried'},
    'gender': {1: 'M', 2: 'F'},
    # enumerate(..., start=1) pairs each label with its 1-based code.
    'empl_status': dict(enumerate(empl, start=1)),
})


couples.head()


# Display a one-column summary table for every column of `couples`:
#   - object (categorical) columns: relative frequencies of up to the 10 most common values;
#   - all other (numerical) columns: the output of .describe().
for col in couples:
    values = couples[col]
    if values.dtype != 'object':
        empr = values.describe().to_frame()
    else:
        empr = values.value_counts(normalize=True).to_frame().iloc[:10]
    display(empr)


# Overlaid, normalized histograms of 'age' (with marginal box plots), one per marital status.
px.histogram(couples, x='age', color='mar_status', histnorm='probability', marginal='box',
             barmode='overlay', opacity=0.7)


# Five randomly chosen rows; the trailing .head() keeps at most the first five, so all of them are shown.
couples.sample(5).head()


# Rows are employment statuses, columns are marital statuses; each cell is the number of matching rows.
# Note that this is a shortcut to picking a column for values and using aggfunc='count'.
empl_cnts = couples.pivot_table(index='empl_status', columns='mar_status', aggfunc='size')
empl_cnts


# Column totals: the number of rows for each marital status.
empl_cnts.sum()

mar_status
married      1484
unmarried     584
dtype: int64


# Divide each column by its total, so each marital-status column becomes a distribution over employment statuses.
cond_distr = empl_cnts / empl_cnts.sum()
cond_distr


# Grouped horizontal bar chart of the two distributions.
cond_distr.plot(kind='barh', title='Distribution of Employment Status, Conditional on Household Type', barmode='group')


cond_distr


# Total variation distance: half the sum of the absolute row-wise differences between the two columns.
cond_distr.diff(axis=1).iloc[:, -1].abs().sum() / 2

0.1269754089281099


def tvd_of_groups(df, groups, cats):
    '''Return the total variation distance (TVD) between the distributions
       of `cats` within the two groups defined by `groups`.

       df: the DataFrame containing both columns.
       groups: the binary column (e.g. married vs. unmarried).
       cats: the categorical column (e.g. employment status).

       Only the last two group columns (in sorted order) are compared, so
       `groups` is expected to contain exactly two distinct values.
    '''
    # Count rows for every (category, group) pair. A category that never
    # occurs in one group comes back as NaN, so fill those cells with 0;
    # otherwise the category is skipped by the sum below and the TVD is
    # understated.
    cnts = df.pivot_table(index=cats, columns=groups, aggfunc='size').fillna(0)
    # Normalize each column so that it sums to 1.
    distr = cnts / cnts.sum()
    # Compute and return the TVD: half the sum of absolute differences.
    return distr.diff(axis=1).iloc[:, -1].abs().sum() / 2


# Same result as above.
# Observed TVD between the employment-status distributions of the two marital-status groups.
observed_tvd = tvd_of_groups(couples, groups='mar_status', cats='empl_status')
observed_tvd

0.1269754089281099


couples.head()


# Adds a column holding a random reordering of the marital statuses; the result is only displayed, not stored.
couples.assign(shuffled_mar=np.random.permutation(couples['mar_status']))


N = 1000
# Simulated TVDs, one per shuffle.
tvds = []

for _ in range(N):
    # Randomly reassign the marital statuses across rows.
    shuffled_statuses = np.random.permutation(couples['mar_status'])
    with_shuffled = couples.assign(shuffled_mar=shuffled_statuses)

    # TVD between the shuffled groups' employment-status distributions.
    tvd = tvd_of_groups(with_shuffled, groups='shuffled_mar', cats='empl_status')
    tvds.append(tvd)


# Normalized histogram of the simulated TVDs.
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability',
                   title='Empirical Distribution of the TVD')

# Mark and label the observed TVD in red.
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=1.15 * observed_tvd, showarrow=False, y=0.055)

fig.update_layout(xaxis_range=[0, 0.2])

# Mark and label the 95th percentile of the simulated TVDs in purple.
# The percentile is computed once and reused for both the line and the label position.
p_95 = np.percentile(tvds, 95)
fig.add_vline(x=p_95, line_color='purple')
annot_text = f'<span style="color:purple">The 95th percentile of our<br>empirical distribution is {round(p_95, 2)}.<br><br>'
annot_text += 'If our observed statistic is to the<br>right of this point, we will reject the null<br>at a 5% <b>significance level</b>.</span>'
fig.add_annotation(text=annot_text, x=1.5 * p_95, showarrow=False, y=0.05)

	age
count	2068.000000
mean	43.165377
std	11.906982
min	18.000000
25%	33.000000
50%	44.000000
75%	53.000000
max	64.000000

mar_status	married	unmarried
empl_status
Not working - disabled	0.048518	0.077055
Not working - looking for work	0.047844	0.118151
Not working - on a temporary layoff from a job	0.014151	0.022260
Not working - other	0.122642	0.056507
Not working - retired	0.063342	0.018836
Working as paid employee	0.610512	0.594178
Working, self-employed	0.092992	0.113014

mar_status	married	unmarried
empl_status
Not working - disabled	0.048518	0.077055
Not working - looking for work	0.047844	0.118151
Not working - on a temporary layoff from a job	0.014151	0.022260
Not working - other	0.122642	0.056507
Not working - retired	0.063342	0.018836
Working as paid employee	0.610512	0.594178
Working, self-employed	0.092992	0.113014

Lecture 10 – Permutation Testing¶

DSC 80, Spring 2023¶

Agenda¶

Permutation testing¶

Hypothesis testing vs. permutation testing¶

Example: Birth weight and smoking 🚬¶

Birth weight and smoking 🚬¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

Hypothesis test setup¶

Null hypothesis: birth weights come from the same distribution¶

Alternative hypothesis: birth weights come from different distributions¶

Choosing a test statistic¶

Difference in group means¶

Hypothesis test setup¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

How close are the means of the shuffled groups?¶

Simulating the empirical distribution of the test statistic¶

Conclusion of the test¶

⚠️ Caution!¶

Differences between categorical distributions¶

Example: Married vs. unmarried couples¶

Cleaning the dataset¶

Understanding the `couples` dataset¶

Understanding employment status in households¶

Differences in the distributions¶

Permutation test for household composition¶

Discussion Question¶

Total variation distance¶

Simulation¶

Conclusion of the test¶

Discussion Question¶

Summary¶

Summary¶

Next time¶

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
3	108	282	23	67	125	True
4	136	286	25	62	93	False

	Birth Weight	Shuffled_Weights
Maternal Smoker
False	123.085315	119.632168
True	113.819172	119.198257

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1

	empl_status
Working as paid employee	0.605899
Not working - other	0.103965
Working, self-employed	0.098646
Not working - looking for work	0.067698
Not working - disabled	0.056576
Not working - retired	0.050774
Not working - on a temporary layoff from a job	0.016441

	gender
M	0.5
F	0.5

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1

Lecture 10 – Permutation Testing¶

DSC 80, Spring 2023¶

Agenda¶

Permutation testing¶

Hypothesis testing vs. permutation testing¶

Example: Birth weight and smoking 🚬¶

Birth weight and smoking 🚬¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

Hypothesis test setup¶

Null hypothesis: birth weights come from the same distribution¶

Alternative hypothesis: birth weights come from different distributions¶

Choosing a test statistic¶

Difference in group means¶

Hypothesis test setup¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

How close are the means of the shuffled groups?¶

Simulating the empirical distribution of the test statistic¶

Conclusion of the test¶

⚠️ Caution!¶

Differences between categorical distributions¶

Example: Married vs. unmarried couples¶

Cleaning the dataset¶

Understanding the couples dataset¶

Understanding employment status in households¶

Differences in the distributions¶

Permutation test for household composition¶

Discussion Question¶

Total variation distance¶

Simulation¶

Conclusion of the test¶

Discussion Question¶

Summary¶

Summary¶

Next time¶

Understanding the `couples` dataset¶

	hh_id	gender	mar_status	rel_rating	age	education	hh_income	empl_status	hh_internet
0	0	1	1	1	51	12	14	1	1
1	0	2	1	1	53	9	14	1	1
2	1	1	1	1	57	11	15	1	1
3	1	2	1	1	57	9	15	1	1
4	2	1	1	1	60	12	14	1	1