from dsc80_utils import *

# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    """Draw one density curve per group for a single column of `df`.

    Rows are split by comparing `df[group_col]` for equality against
    `group1` and `group2`; the `vals_col` values of each subset are handed
    to `ff.create_distplot` with the histogram and rug layers switched off,
    so only the curves remain.

    Parameters
    ----------
    df : DataFrame holding both `group_col` and `vals_col`.
    group_col : name of the column that identifies the group of each row.
    group1, group2 : the two values of `group_col` to compare; they are
        also used as the legend labels.
    vals_col : name of the column whose distribution is plotted.
    title : figure title (empty by default).

    Returns
    -------
    Whatever `update_layout` returns for the created figure.

    NOTE(review): `ff` is not defined in this file; presumably it is
    `plotly.figure_factory` re-exported by `dsc80_utils` -- confirm.
    """
    labels = [group1, group2]
    # One sample of `vals_col` per label, in the same order as the labels.
    samples = [df.loc[df[group_col] == label, vals_col] for label in labels]
    kde_fig = ff.create_distplot(
        hist_data=samples,
        group_labels=labels,
        show_hist=False,
        show_rug=False,
        colors=['#ef553b', '#636efb'],
    )
    return kde_fig.update_layout(title=title)

# Load the birth-weight dataset from data/babyweights.csv.
baby = pd.read_csv(Path('data') / 'babyweights.csv')
baby

# Keep only the two columns this example uses: the grouping column
# ('Maternal Smoker') and the measured value ('Birth Weight').
baby = baby[['Maternal Smoker', 'Birth Weight']]
baby.head()

# Mean birth weight and number of rows within each 'Maternal Smoker' group.
baby.groupby('Maternal Smoker')['Birth Weight'].agg(['mean', 'count'])

# Overlaid, probability-normalized histograms of birth weight, one per
# 'Maternal Smoker' group, each with a marginal box plot.
# `x` is named explicitly, as the other histograms in this file do, so the
# plotted column and its axis title do not depend on Plotly Express's
# wide-form inference over whichever columns `baby` happens to contain.
fig = px.histogram(baby, x='Birth Weight', color='Maternal Smoker', histnorm='probability',
                   marginal='box', title="Birth Weight by Mother's Smoking Status",
                   barmode='overlay', opacity=0.7)
fig

# The notebook shows this figure a second time; reuse the object built above
# instead of constructing an identical figure again.
fig

# Mean birth weight within each 'Maternal Smoker' group (index: False / True).
group_means = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
group_means

Maternal Smoker
False    123.09
True     113.82
Name: Birth Weight, dtype: float64

# Difference in group means, oriented as True minus False, i.e. the
# 'Maternal Smoker' == True group minus the 'Maternal Smoker' == False group.
group_means.loc[True] - group_means.loc[False]

np.float64(-9.266142572024918)

baby.head()

# np.random.permutation returns the 'Birth Weight' values in a random order
# as a new array; `baby` itself is not modified by this call.
np.random.permutation(baby['Birth Weight'])

array([ 90, 117, 121, ..., 125, 114,  98])

# Build a copy of `baby` that carries one random shuffle of the birth weights
# in a new 'Shuffled_Weights' column; the 'Maternal Smoker' labels stay put.
with_shuffled = baby.copy()
with_shuffled['Shuffled_Weights'] = np.random.permutation(baby['Birth Weight'])
with_shuffled.head()

# Per-group means of every remaining column (original and shuffled weights).
group_means = with_shuffled.groupby('Maternal Smoker').mean()
group_means

# Draw the same overlaid histogram twice: once for the real weights and once
# for the shuffled ones, with the True-minus-False mean difference in the title.
for column in ('Birth Weight', 'Shuffled_Weights'):
    smoker_mean = group_means.loc[True, column]
    nonsmoker_mean = group_means.loc[False, column]
    gap = smoker_mean - nonsmoker_mean
    fig = px.histogram(
        with_shuffled,
        x=column,
        color='Maternal Smoker',
        barmode='overlay',
        opacity=0.7,
        histnorm='probability',
        marginal='box',
        title=f"Using the {column} column <br>(difference in means = {gap:.2f})",
    )
    # Extra top margin: the title contains a line break.
    fig.update_layout(margin={'t': 60})
    fig.show()

n_repetitions = 500

# One simulated test statistic per shuffle of the birth weights.
differences = []
for _ in range(n_repetitions):

    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))

    # Step 2: Compute the test statistic (mean of the True group minus mean
    # of the False group). The shuffled column is selected *before*
    # aggregating so that only that column is averaged, instead of averaging
    # every column and then discarding all but one.
    group_means = (
        with_shuffled
        .groupby('Maternal Smoker')
        ['Shuffled_Weights']
        .mean()
    )
    difference = group_means.loc[True] - group_means.loc[False]

    # Step 3: Store the result.
    differences.append(difference)

differences[:10]

[np.float64(-0.8703353291588627),
 np.float64(2.4278897573015144),
 np.float64(-0.25147096911803146),
 np.float64(1.1686975334034173),
 np.float64(0.44967015555249645),
 np.float64(-0.4196017490135233),
 np.float64(-0.680741045446922),
 np.float64(-2.5838383838383834),
 np.float64(-0.5233420174596546),
 np.float64(1.5299998476469057)]

# Observed test statistic on the unshuffled data: mean birth weight of the
# True group minus that of the False group (the same orientation used for
# the simulated differences).
mean_weights = baby.groupby('Maternal Smoker')['Birth Weight'].mean()
observed_difference = mean_weights[True] - mean_weights[False]
observed_difference

np.float64(-9.266142572024918)

# Histogram of the simulated differences. Wrapping the list in a DataFrame
# yields a single column labelled 0, which is what x=0 refers to.
fig = px.histogram(
    pd.DataFrame(differences), x=0, nbins=50, histnorm='probability', 
    title='Empirical Distribution of the Mean Differences <br> in Birth Weights (Smoker - Non-Smoker)')
# Mark the observed statistic with a red vertical line.
fig.add_vline(x=observed_difference, line_color='red')
# Fix the x-axis to [-10, 10] and leave extra top margin for the two-line title.
fig.update_layout(xaxis_range=[-10, 10], margin=dict(t=60))

# Load data/midparent.csv, rename 'childHeight' to 'child', and keep only the
# four columns used below.
heights_path = Path('data') / 'midparent.csv'
heights = (pd.read_csv(heights_path)
           .rename(columns={'childHeight': 'child'})
           [['father', 'mother', 'gender', 'child']])
heights.head()

# Number of missing values in each column.
heights.isna().sum()

father    0
mother    0
gender    0
child     0
dtype: int64

# Pairwise scatter plots of every column except 'gender'.
fig = px.scatter_matrix(heights.drop(columns=['gender']))
fig

np.random.seed(42) # So that we get the same results each time (for lecture).

# Simulate missing values: work on a copy so `heights` stays intact, pick 30%
# of the rows with an unweighted random sample, and blank out their 'child'
# value. The choice of rows does not look at any column's values.
heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.nan

heights_mcar.head(10)

# Fraction of missing values in each column.
heights_mcar.isna().mean()

father    0.0
mother    0.0
gender    0.0
child     0.3
dtype: float64

# Flag each row by whether its 'child' value is missing.
heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()

# Distribution of 'gender' within each missingness group, built in one chain:
# count rows per (gender, child_missing) cell, give the two columns readable
# labels, then divide each column by its total so it sums to 1.
gender_dist = (
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
    # Relabelled only to make the resulting table easier to read.
    .set_axis(['child_missing = False', 'child_missing = True'], axis=1)
    .pipe(lambda counts: counts / counts.sum())
)
gender_dist

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')

n_repetitions = 500
shuffled = heights_mcar.copy()

# One simulated TVD per shuffle of the gender labels.
tvds = []
for _ in range(n_repetitions):

    # Permute the gender labels. The result is written back into the same
    # DataFrame (rather than building a new one each time) for performance
    # reasons; see
    # https://dsc80.com/resources/lectures/lec07/lec07-fast-permutation-tests.html.
    shuffled['gender'] = np.random.permutation(shuffled['gender'])

    # Row counts per (gender, child_missing) cell, then column-wise proportions.
    counts = shuffled.pivot_table(index='gender', columns='child_missing', aggfunc='size')
    proportions = counts / counts.sum()

    # TVD: half the sum of absolute differences between the last column
    # and the column before it.
    column_gap = proportions.diff(axis=1).iloc[:, -1]
    tvd = column_gap.abs().sum() / 2
    tvds.append(tvd)

# The same statistic computed on the unshuffled distribution.
observed_gap = gender_dist.diff(axis=1).iloc[:, -1]
observed_tvd = observed_gap.abs().sum() / 2
observed_tvd

np.float64(0.009196155526430771)

# Histogram of the simulated TVDs; the DataFrame built from the list has a
# single column labelled 0, which is what x=0 refers to.
fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
# Mark the observed TVD with a red vertical line.
fig.add_vline(x=observed_tvd, line_color='red', line_width=2, opacity=1)
# Label the line; the text is anchored at x = 2.5 * observed_tvd, y = 0.16,
# and shows the observed value rounded to two decimals.
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=2.5 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

# p-value: proportion of simulated TVDs at least as large as the observed one.
(np.array(tvds) >= observed_tvd).mean()

np.float64(0.838)

# Compare the distribution of 'father' between rows where 'child' is missing
# and rows where it is not: overlaid, probability-normalized histograms with
# marginal box plots.
px.histogram(heights_mcar, x='father', color='child_missing', histnorm='probability', marginal='box',
             title="Father's Height by Missingness of Child Height", barmode='overlay', opacity=0.7)

# The same comparison drawn as curves, using the create_kde_plotly helper
# defined at the top of this file.
create_kde_plotly(heights_mcar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height")

Phone	Screen Size	Price
iPhone 15	6.06	999
Galaxy Z Fold5	7.6	NaN
OnePlus 12R	6.7	499
iPhone 14 Pro Max	6.68	NaN

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	69.0
3	78.5	67.0	female	69.0
4	75.5	66.5	male	73.5

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	NaN
...	...	...	...	...
7	75.5	66.5	female	NaN
8	75.0	64.0	male	71.0
9	75.0	64.0	female	68.0

	father	mother	gender	child	child_missing
0	78.5	67.0	male	73.2	False
1	78.5	67.0	female	69.2	False
2	78.5	67.0	female	NaN	True
3	78.5	67.0	female	69.0	False
4	75.5	66.5	male	73.5	False

	child_missing = False	child_missing = True
gender
female	0.49	0.48
male	0.51	0.52

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
...	...	...	...	...	...	...
1171	130	291	30	65	150	True
1172	125	281	21	65	110	False
1173	117	297	38	65	129	False

	Maternal Smoker	Birth Weight	Shuffled_Weights
0	False	120	117
1	False	113	109
2	True	128	114
3	True	108	144
4	False	136	116

Lecture 7 – Missingness Mechanisms¶

DSC 80, Fall 2024¶

Announcements 📣¶

Agenda 📆¶

Additional resources¶

Permutation testing¶

Hypothesis testing vs. permutation testing¶

Hypothesis testing vs. permutation testing¶

Example: Birth weight and smoking 🚬¶

Exploratory data analysis¶

Visualizing birth weight distributions¶

Null hypothesis: birth weights come from the same distribution¶

Alternative hypothesis: birth weights come from different distributions¶

Choosing a test statistic¶

Difference in group means¶

Hypothesis test setup¶

Implications of the null hypothesis¶

Permutation tests¶

Shuffling¶

How close are the means of the shuffled groups?¶

Simulating the empirical distribution of the test statistic¶

Conclusion of the test¶

⚠️ Caution!¶

Hypothesis testing vs. permutation testing¶

Question 🤔 (Answer at dsc80.com/q)

Make sure to check out the "Permutation testing meets TVD" section from last lecture. It has one more example of using a permutation test that you should be familiar with.

Missingness mechanisms¶

Imperfect data¶

Imperfect data¶

Types of missingness¶

Missing by design (MD)¶

Missing by design¶

Other types of missingness¶

Mom... the dog ate my data! 🐶¶

Question 🤔 (Answer at dsc80.com/q)

The real world is messy! 🌎¶

Not missing at random (NMAR)¶

Missing completely at random (MCAR)¶

Missing at random (MAR)¶

Isn't everything NMAR? 🤔¶

Flowchart¶

Question 🤔 (Answer at dsc80.com/q)

Why do we care again?¶

Question 🤔 (Answer at dsc80.com/q)

Formal definitions¶

A diagram (Sam will annotate)¶

Identifying missingness mechanisms in data¶

Identifying missingness mechanisms in data¶

Assessing NMAR¶

Assessing MAR¶

Deciding between MCAR and MAR¶

Deciding between MCAR and MAR¶

Example: Heights¶

Simulating MCAR data¶

Verifying that child heights are MCAR in heights_mcar¶

Comparing null and non-null 'child' distributions for 'gender'¶

Comparing null and non-null 'child' distributions for 'gender'¶

Simulation¶

Results¶

Comparing null and non-null 'child' distributions for 'father'¶

Concluding that 'child' is MCAR¶

Summary, next time¶

Summary¶

Next time¶

Verifying that child heights are MCAR in `heights_mcar`¶

Comparing null and non-null `'child'` distributions for `'gender'`¶

Comparing null and non-null `'child'` distributions for `'gender'`¶

Comparing null and non-null `'child'` distributions for `'father'`¶

Concluding that `'child'` is MCAR¶