from dsc80_utils import *

# Used for plotting examples.
def create_kde_plotly(df, group_col, group1, group2, vals_col, title=''):
    fig = ff.create_distplot(
        hist_data=[df.loc[df[group_col] == group1, vals_col], df.loc[df[group_col] == group2, vals_col]],
        group_labels=[group1, group2],
        show_rug=False, show_hist=False,
        colors=['#ef553b', '#636efb'],
    )
    return fig.update_layout(title=title)

heights = pd.read_csv('data/midparent.csv')
heights = heights.rename(columns={'childHeight': 'child'})
heights = heights[['father', 'mother', 'gender', 'child']]
heights.head()

heights.isna().sum()

father    0
mother    0
gender    0
child     0
dtype: int64

fig = px.scatter_matrix(heights.drop(columns=['gender']))
fig

np.random.seed(42) # So that we get the same results each time (for lecture).

heights_mcar = heights.copy()
idx = heights_mcar.sample(frac=0.3).index
heights_mcar.loc[idx, 'child'] = np.NaN

heights_mcar.head(10)

heights_mcar.isna().mean()

father    0.0
mother    0.0
gender    0.0
child     0.3
dtype: float64

heights_mcar['child_missing'] = heights_mcar['child'].isna()
heights_mcar.head()

gender_dist = (
    heights_mcar
    .assign(child_missing=heights_mcar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']

gender_dist = gender_dist / gender_dist.sum()
gender_dist

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')

n_repetitions = 500
shuffled = heights_mcar.copy()

tvds = []
for _ in range(n_repetitions):
    
    # Shuffling genders. 
    # Note that we are assigning back to the same DataFrame for performance reasons; 
    # see https://dsc80.com/resources/lectures/lec11/lec11-fast-permutation-tests.html.
    shuffled['gender'] = np.random.permutation(shuffled['gender'])
    
    # Computing and storing the TVD.
    pivoted = (
        shuffled
        .pivot_table(index='gender', columns='child_missing', aggfunc='size')
        .apply(lambda x: x / x.sum())
    )
    
    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds.append(tvd)

observed_tvd = gender_dist.diff(axis=1).iloc[:, -1].abs().sum() / 2
observed_tvd

0.009196155526430771

fig = px.histogram(pd.DataFrame(tvds), x=0, nbins=50, histnorm='probability', 
                   title='Empirical Distribution of the TVD')
fig.add_vline(x=observed_tvd, line_color='red')
fig.add_annotation(text=f'<span style="color:red">Observed TVD = {round(observed_tvd, 2)}</span>',
                   x=2.3 * observed_tvd, showarrow=False, y=0.16)
fig.update_layout(yaxis_range=[0, 0.2])

np.mean(np.array(tvds) >= observed_tvd)

0.838

px.histogram(heights_mcar, x='father', color='child_missing', histnorm='probability', marginal='box',
             title="Father's Height by Missingness of Child Height", barmode='overlay', opacity=0.7)

create_kde_plotly(heights_mcar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height")

np.random.seed(42) # So that we get the same results each time (for lecture).

def make_missing(r):
    rand = np.random.uniform() # Random real number between 0 and 1.
    if r['father'] > 72 and rand < 0.5:
        return np.NaN
    elif r['gender'] == 'female' and rand < 0.3:
        return np.NaN
    else:
        return r['child']
    
heights_mar = heights.copy()
heights_mar['child'] = heights_mar.apply(make_missing, axis=1)
heights_mar['child_missing'] = heights_mar['child'].isna()

heights_mar.head()

gender_dist = (
    heights_mar
    .assign(child_missing=heights_mar['child'].isna())
    .pivot_table(index='gender', columns='child_missing', aggfunc='size')
)

# Added just to make the resulting pivot table easier to read.
gender_dist.columns = ['child_missing = False', 'child_missing = True']

gender_dist = gender_dist / gender_dist.sum()
gender_dist

gender_dist.plot(kind='barh', title='Gender by Missingness of Child Height', barmode='group')

create_kde_plotly(heights_mar, 'child_missing', True, False, 'father', 
                  "Father's Height by Missingness of Child Height")

heights_mar.groupby('child_missing')['father'].mean().diff().iloc[-1]

1.0055466604787853

Phone	Screen Size	Price
iPhone 14	6.06	999
Galaxy Z Fold 4	7.6	NaN
OnePlus 9 Pro	6.7	799
iPhone 13 Pro Max	6.68	NaN

Phone	Screen Size	Price
iPhone 14	6.06	999
Galaxy Z Fold 4	7.6	NaN
OnePlus 9 Pro	6.7	799
iPhone 13 Pro Max	6.68	NaN

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	69.0
3	78.5	67.0	female	69.0
4	75.5	66.5	male	73.5

	father	mother	gender	child
0	78.5	67.0	male	73.2
1	78.5	67.0	female	69.2
2	78.5	67.0	female	NaN
...	...	...	...	...
7	75.5	66.5	female	NaN
8	75.0	64.0	male	71.0
9	75.0	64.0	female	68.0

	child_missing = False	child_missing = True
gender
female	0.49	0.48
male	0.51	0.52

	child_missing = False	child_missing = True
gender
female	0.4	0.88
male	0.6	0.12

Lecture 7 – Missingness Mechanisms¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

📝 Midterm Exam¶

📆 Agenda¶

Review: Hypothesis testing¶

Missingness mechanisms¶

Imperfect data¶

Imperfect data¶

Types of missingness¶

Missing by design (MD)¶

Missing by design¶

Other types of missingness¶

Mom... the dog ate my data! 🐶¶

Discussion Question¶

Discussion Question, solved¶

The real world is messy! 🌎¶

Not missing at random (NMAR)¶

Missing completely at random (MCAR)¶

Missing at random (MAR)¶

Isn't everything NMAR? 🤔¶

Flowchart¶

Discussion Question¶

Why do we care again?¶

Formal definitions¶

Formal definition: MCAR¶

Formal definition: MAR¶

Formal definition: NMAR¶

Assessing missingness through data¶

Assessing missingness through data¶

Assessing NMAR¶

Assessing MAR¶

Deciding between MCAR and MAR¶

Discussion Question¶

Why do we care again?¶

Formal definitions¶

Formal definition: MCAR¶

Formal definition: MAR¶

Formal definition: NMAR¶

Assessing missingness through data¶

Assessing missingness through data¶

Assessing NMAR¶

Assessing MAR¶

Deciding between MCAR and MAR¶

Deciding between MCAR and MAR¶

Example: Heights¶

Simulating MCAR data¶

Verifying that child heights are MCAR in heights_mcar¶

Comparing null and non-null 'child' distributions for 'gender'¶

Comparing null and non-null 'child' distributions for 'gender'¶

Simulation¶

Results¶

Comparing null and non-null 'child' distributions for 'father'¶

Concluding that 'child' is MCAR¶

Simulating MAR data¶

Comparing null and non-null 'child' distributions for 'gender', again¶

Comparing null and non-null 'child' distributions for 'father', again¶

Questions?¶

Next Time¶

Verifying that child heights are MCAR in `heights_mcar`¶

Comparing null and non-null `'child'` distributions for `'gender'`¶

Comparing null and non-null `'child'` distributions for `'gender'`¶

Comparing null and non-null `'child'` distributions for `'father'`¶

Concluding that `'child'` is MCAR¶

Comparing null and non-null `'child'` distributions for `'gender'`, again¶

Comparing null and non-null `'child'` distributions for `'father'`, again¶